1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
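/* Example (illustrative only): a constant vector whose 16-bit elements
   are 0, 2, 4, ... can be described for the SVE INDEX instruction as:

     simd_immediate_info info (HImode, const0_rtx, gen_int_mode (2, HImode));

   i.e. base 0 and step 2, corresponding to "index z0.h, #0, #2".  */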
177 namespace {
178
179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
180 class pure_scalable_type_info
181 {
182 public:
183 /* Represents the result of analyzing a type. All values are nonzero,
184 in the possibly forlorn hope that accidental conversions to bool
185 trigger a warning. */
186 enum analysis_result
187 {
188 /* The type does not have an ABI identity; i.e. it doesn't contain
189 at least one object whose type is a Fundamental Data Type. */
190 NO_ABI_IDENTITY = 1,
191
192 /* The type is definitely a Pure Scalable Type. */
193 IS_PST,
194
195 /* The type is definitely not a Pure Scalable Type. */
196 ISNT_PST,
197
198 /* It doesn't matter for PCS purposes whether the type is a Pure
199 Scalable Type or not, since the type will be handled the same
200 way regardless.
201
202 Specifically, this means that if the type is a Pure Scalable Type,
203 there aren't enough argument registers to hold it, and so it will
204 need to be passed or returned in memory. If the type isn't a
205 Pure Scalable Type, it's too big to be passed or returned in core
206 or SIMD&FP registers, and so again will need to go in memory. */
207 DOESNT_MATTER
208 };
209
210 /* Aggregates of 17 bytes or more are normally passed and returned
211 in memory, so aggregates of that size can safely be analyzed as
212 DOESNT_MATTER. We need to be able to collect enough pieces to
213 represent a PST that is smaller than that. Since predicates are
214 2 bytes in size for -msve-vector-bits=128, that means we need to be
215 able to store at least 8 pieces.
216
217 We also need to be able to store enough pieces to represent
218 a single vector in each vector argument register and a single
219 predicate in each predicate argument register. This means that
220 we need at least 12 pieces. */
221 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222 #if __cplusplus >= 201103L
223 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224 #endif
225
226 /* Describes one piece of a PST. Each piece is one of:
227
228 - a single Scalable Vector Type (SVT)
229 - a single Scalable Predicate Type (SPT)
230 - a PST containing 2, 3 or 4 SVTs, with no padding
231
232 It either represents a single built-in type or a PST formed from
233 multiple homogeneous built-in types. */
234 struct piece
235 {
236 rtx get_rtx (unsigned int, unsigned int) const;
237
238 /* The number of vector and predicate registers that the piece
239 occupies. One of the two is always zero. */
240 unsigned int num_zr;
241 unsigned int num_pr;
242
243 /* The mode of the registers described above. */
244 machine_mode mode;
245
246 /* If this piece is formed from multiple homogeneous built-in types,
247 this is the mode of the built-in types, otherwise it is MODE. */
248 machine_mode orig_mode;
249
250 /* The offset in bytes of the piece from the start of the type. */
251 poly_uint64_pod offset;
252 };
253
254 /* Divides types analyzed as IS_PST into individual pieces. The pieces
255 are in memory order. */
256 auto_vec<piece, MAX_PIECES> pieces;
257
258 unsigned int num_zr () const;
259 unsigned int num_pr () const;
260
261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
262
263 analysis_result analyze (const_tree);
264 bool analyze_registers (const_tree);
265
266 private:
267 analysis_result analyze_array (const_tree);
268 analysis_result analyze_record (const_tree);
269 void add_piece (const piece &);
270 };
271 }
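/* Example (illustrative only): svfloat32_t is analyzed as a single piece
   occupying one Z register, svbool_t as a single piece occupying one
   P register, and a built-in tuple type such as svfloat32x3_t as a PST
   whose single piece occupies three Z registers.  An ordinary scalar type
   such as int, or any aggregate containing one, is analyzed as ISNT_PST.  */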
272
273 /* The current code model. */
274 enum aarch64_code_model aarch64_cmodel;
275
276 /* The number of 64-bit elements in an SVE vector. */
277 poly_uint16 aarch64_sve_vg;
278
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
283
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 const_tree,
288 machine_mode *, int *,
289 bool *);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 const_tree type,
297 int misalignment,
298 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
303
304 /* Major revision number of the ARM Architecture implemented by the target. */
305 unsigned aarch64_architecture_version;
306
307 /* The processor for which instructions should be scheduled. */
308 enum aarch64_processor aarch64_tune = cortexa53;
309
310 /* Mask to specify which instruction scheduling options should be used. */
311 uint64_t aarch64_tune_flags = 0;
312
313 /* Global flag for PC relative loads. */
314 bool aarch64_pcrelative_literal_loads;
315
316 /* Global flag for whether frame pointer is enabled. */
317 bool aarch64_use_frame_pointer;
318
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
321
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
324
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
328 {
329 const char* name;
330 unsigned int flag;
331 };
332
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
336 {
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
341 };
342
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
346 {
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
351 };
352
353 /* Tuning parameters. */
354
355 static const struct cpu_addrcost_table generic_addrcost_table =
356 {
357 {
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
362 },
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* register_offset */
366 0, /* register_sextend */
367 0, /* register_zextend */
368 0 /* imm_offset */
369 };
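/* Rough guide to the fields above (illustrative, based on the field names):
   the hi/si/di/ti block is the extra cost of scaled addressing for 16-,
   32-, 64- and 128-bit accesses, "register_offset" covers addresses such
   as [x0, x1], "register_sextend"/"register_zextend" cover [x0, w1, sxtw]
   and [x0, w1, uxtw], "pre_modify"/"post_modify" cover writeback forms
   like [x0, #16]! and [x0], #16, and "imm_offset" covers a plain
   [x0, #16].  */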
370
371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
372 {
373 {
374 0, /* hi */
375 0, /* si */
376 0, /* di */
377 2, /* ti */
378 },
379 0, /* pre_modify */
380 0, /* post_modify */
381 1, /* register_offset */
382 1, /* register_sextend */
383 2, /* register_zextend */
384 0, /* imm_offset */
385 };
386
387 static const struct cpu_addrcost_table xgene1_addrcost_table =
388 {
389 {
390 1, /* hi */
391 0, /* si */
392 0, /* di */
393 1, /* ti */
394 },
395 1, /* pre_modify */
396 1, /* post_modify */
397 0, /* register_offset */
398 1, /* register_sextend */
399 1, /* register_zextend */
400 0, /* imm_offset */
401 };
402
403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
404 {
405 {
406 1, /* hi */
407 1, /* si */
408 1, /* di */
409 2, /* ti */
410 },
411 0, /* pre_modify */
412 0, /* post_modify */
413 2, /* register_offset */
414 3, /* register_sextend */
415 3, /* register_zextend */
416 0, /* imm_offset */
417 };
418
419 static const struct cpu_addrcost_table tsv110_addrcost_table =
420 {
421 {
422 1, /* hi */
423 0, /* si */
424 0, /* di */
425 1, /* ti */
426 },
427 0, /* pre_modify */
428 0, /* post_modify */
429 0, /* register_offset */
430 1, /* register_sextend */
431 1, /* register_zextend */
432 0, /* imm_offset */
433 };
434
435 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
436 {
437 {
438 1, /* hi */
439 1, /* si */
440 1, /* di */
441 2, /* ti */
442 },
443 1, /* pre_modify */
444 1, /* post_modify */
445 3, /* register_offset */
446 3, /* register_sextend */
447 3, /* register_zextend */
448 2, /* imm_offset */
449 };
450
451 static const struct cpu_regmove_cost generic_regmove_cost =
452 {
453 1, /* GP2GP */
454 /* Avoid the use of slow int<->fp moves for spilling by setting
455 their cost higher than memmov_cost. */
456 5, /* GP2FP */
457 5, /* FP2GP */
458 2 /* FP2FP */
459 };
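/* For example, with the generic table above a GP<->FP transfer such as
   "fmov d0, x0" or "fmov x0, d0" is costed at 5, deliberately above the
   memmov_cost of 4 in generic_tunings below, so the register allocator
   prefers an ordinary spill and reload to bouncing a value through the
   other register file.  */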
460
461 static const struct cpu_regmove_cost cortexa57_regmove_cost =
462 {
463 1, /* GP2GP */
464 /* Avoid the use of slow int<->fp moves for spilling by setting
465 their cost higher than memmov_cost. */
466 5, /* GP2FP */
467 5, /* FP2GP */
468 2 /* FP2FP */
469 };
470
471 static const struct cpu_regmove_cost cortexa53_regmove_cost =
472 {
473 1, /* GP2GP */
474 /* Avoid the use of slow int<->fp moves for spilling by setting
475 their cost higher than memmov_cost. */
476 5, /* GP2FP */
477 5, /* FP2GP */
478 2 /* FP2FP */
479 };
480
481 static const struct cpu_regmove_cost exynosm1_regmove_cost =
482 {
483 1, /* GP2GP */
484 /* Avoid the use of slow int<->fp moves for spilling by setting
485 their cost higher than memmov_cost (the actual costs are 4 and 9). */
486 9, /* GP2FP */
487 9, /* FP2GP */
488 1 /* FP2FP */
489 };
490
491 static const struct cpu_regmove_cost thunderx_regmove_cost =
492 {
493 2, /* GP2GP */
494 2, /* GP2FP */
495 6, /* FP2GP */
496 4 /* FP2FP */
497 };
498
499 static const struct cpu_regmove_cost xgene1_regmove_cost =
500 {
501 1, /* GP2GP */
502 /* Avoid the use of slow int<->fp moves for spilling by setting
503 their cost higher than memmov_cost. */
504 8, /* GP2FP */
505 8, /* FP2GP */
506 2 /* FP2FP */
507 };
508
509 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
510 {
511 2, /* GP2GP */
512 /* Avoid the use of int<->fp moves for spilling. */
513 6, /* GP2FP */
514 6, /* FP2GP */
515 4 /* FP2FP */
516 };
517
518 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
519 {
520 1, /* GP2GP */
521 /* Avoid the use of int<->fp moves for spilling. */
522 8, /* GP2FP */
523 8, /* FP2GP */
524 4 /* FP2FP */
525 };
526
527 static const struct cpu_regmove_cost tsv110_regmove_cost =
528 {
529 1, /* GP2GP */
530 /* Avoid the use of slow int<->fp moves for spilling by setting
531 their cost higher than memmov_cost. */
532 2, /* GP2FP */
533 3, /* FP2GP */
534 2 /* FP2FP */
535 };
536
537 /* Generic costs for vector insn classes. */
538 static const struct cpu_vector_cost generic_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 1, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 1, /* vec_int_stmt_cost */
545 1, /* vec_fp_stmt_cost */
546 2, /* vec_permute_cost */
547 2, /* vec_to_scalar_cost */
548 1, /* scalar_to_vec_cost */
549 1, /* vec_align_load_cost */
550 1, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 3, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
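/* Worked example using the generic numbers above: a vectorized loop body
   containing one aligned load, one integer operation and one store is
   costed at vec_align_load_cost + vec_int_stmt_cost + vec_store_cost
   = 1 + 1 + 1 = 3, and a final reduction to a scalar adds
   vec_to_scalar_cost = 2 on top of that.  */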
556
557 /* QDF24XX costs for vector insn classes. */
558 static const struct cpu_vector_cost qdf24xx_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 1, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 1, /* vec_int_stmt_cost */
565 3, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 1, /* vec_to_scalar_cost */
568 1, /* scalar_to_vec_cost */
569 1, /* vec_align_load_cost */
570 1, /* vec_unalign_load_cost */
571 1, /* vec_unalign_store_cost */
572 1, /* vec_store_cost */
573 3, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* ThunderX costs for vector insn classes. */
578 static const struct cpu_vector_cost thunderx_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 1, /* scalar_fp_stmt_cost */
582 3, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 4, /* vec_int_stmt_cost */
585 1, /* vec_fp_stmt_cost */
586 4, /* vec_permute_cost */
587 2, /* vec_to_scalar_cost */
588 2, /* scalar_to_vec_cost */
589 3, /* vec_align_load_cost */
590 5, /* vec_unalign_load_cost */
591 5, /* vec_unalign_store_cost */
592 1, /* vec_store_cost */
593 3, /* cond_taken_branch_cost */
594 3 /* cond_not_taken_branch_cost */
595 };
596
597 static const struct cpu_vector_cost tsv110_vector_cost =
598 {
599 1, /* scalar_int_stmt_cost */
600 1, /* scalar_fp_stmt_cost */
601 5, /* scalar_load_cost */
602 1, /* scalar_store_cost */
603 2, /* vec_int_stmt_cost */
604 2, /* vec_fp_stmt_cost */
605 2, /* vec_permute_cost */
606 3, /* vec_to_scalar_cost */
607 2, /* scalar_to_vec_cost */
608 5, /* vec_align_load_cost */
609 5, /* vec_unalign_load_cost */
610 1, /* vec_unalign_store_cost */
611 1, /* vec_store_cost */
612 1, /* cond_taken_branch_cost */
613 1 /* cond_not_taken_branch_cost */
614 };
615
616 /* Cortex-A57 costs for vector insn classes. */
617 static const struct cpu_vector_cost cortexa57_vector_cost =
618 {
619 1, /* scalar_int_stmt_cost */
620 1, /* scalar_fp_stmt_cost */
621 4, /* scalar_load_cost */
622 1, /* scalar_store_cost */
623 2, /* vec_int_stmt_cost */
624 2, /* vec_fp_stmt_cost */
625 3, /* vec_permute_cost */
626 8, /* vec_to_scalar_cost */
627 8, /* scalar_to_vec_cost */
628 4, /* vec_align_load_cost */
629 4, /* vec_unalign_load_cost */
630 1, /* vec_unalign_store_cost */
631 1, /* vec_store_cost */
632 1, /* cond_taken_branch_cost */
633 1 /* cond_not_taken_branch_cost */
634 };
635
636 static const struct cpu_vector_cost exynosm1_vector_cost =
637 {
638 1, /* scalar_int_stmt_cost */
639 1, /* scalar_fp_stmt_cost */
640 5, /* scalar_load_cost */
641 1, /* scalar_store_cost */
642 3, /* vec_int_stmt_cost */
643 3, /* vec_fp_stmt_cost */
644 3, /* vec_permute_cost */
645 3, /* vec_to_scalar_cost */
646 3, /* scalar_to_vec_cost */
647 5, /* vec_align_load_cost */
648 5, /* vec_unalign_load_cost */
649 1, /* vec_unalign_store_cost */
650 1, /* vec_store_cost */
651 1, /* cond_taken_branch_cost */
652 1 /* cond_not_taken_branch_cost */
653 };
654
655 /* X-Gene 1 costs for vector insn classes. */
656 static const struct cpu_vector_cost xgene1_vector_cost =
657 {
658 1, /* scalar_int_stmt_cost */
659 1, /* scalar_fp_stmt_cost */
660 5, /* scalar_load_cost */
661 1, /* scalar_store_cost */
662 2, /* vec_int_stmt_cost */
663 2, /* vec_fp_stmt_cost */
664 2, /* vec_permute_cost */
665 4, /* vec_to_scalar_cost */
666 4, /* scalar_to_vec_cost */
667 10, /* vec_align_load_cost */
668 10, /* vec_unalign_load_cost */
669 2, /* vec_unalign_store_cost */
670 2, /* vec_store_cost */
671 2, /* cond_taken_branch_cost */
672 1 /* cond_not_taken_branch_cost */
673 };
674
675 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
676 static const struct cpu_vector_cost thunderx2t99_vector_cost =
677 {
678 1, /* scalar_int_stmt_cost */
679 6, /* scalar_fp_stmt_cost */
680 4, /* scalar_load_cost */
681 1, /* scalar_store_cost */
682 5, /* vec_int_stmt_cost */
683 6, /* vec_fp_stmt_cost */
684 10, /* vec_permute_cost */
685 6, /* vec_to_scalar_cost */
686 5, /* scalar_to_vec_cost */
687 8, /* vec_align_load_cost */
688 8, /* vec_unalign_load_cost */
689 4, /* vec_unalign_store_cost */
690 4, /* vec_store_cost */
691 2, /* cond_taken_branch_cost */
692 1 /* cond_not_taken_branch_cost */
693 };
694
695 /* Generic costs for branch instructions. */
696 static const struct cpu_branch_cost generic_branch_cost =
697 {
698 1, /* Predictable. */
699 3 /* Unpredictable. */
700 };
701
702 /* Generic approximation modes. */
703 static const cpu_approx_modes generic_approx_modes =
704 {
705 AARCH64_APPROX_NONE, /* division */
706 AARCH64_APPROX_NONE, /* sqrt */
707 AARCH64_APPROX_NONE /* recip_sqrt */
708 };
709
710 /* Approximation modes for Exynos M1. */
711 static const cpu_approx_modes exynosm1_approx_modes =
712 {
713 AARCH64_APPROX_NONE, /* division */
714 AARCH64_APPROX_ALL, /* sqrt */
715 AARCH64_APPROX_ALL /* recip_sqrt */
716 };
717
718 /* Approximation modes for X-Gene 1. */
719 static const cpu_approx_modes xgene1_approx_modes =
720 {
721 AARCH64_APPROX_NONE, /* division */
722 AARCH64_APPROX_NONE, /* sqrt */
723 AARCH64_APPROX_ALL /* recip_sqrt */
724 };
725
726 /* Generic prefetch settings (which disable prefetch). */
727 static const cpu_prefetch_tune generic_prefetch_tune =
728 {
729 0, /* num_slots */
730 -1, /* l1_cache_size */
731 -1, /* l1_cache_line_size */
732 -1, /* l2_cache_size */
733 true, /* prefetch_dynamic_strides */
734 -1, /* minimum_stride */
735 -1 /* default_opt_level */
736 };
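/* These records are, roughly speaking, the per-CPU defaults behind the
   cache-related --params: l1_cache_size and l2_cache_size are in KiB,
   l1_cache_line_size is in bytes, and -1 means "keep the generic
   default".  A non-negative default_opt_level (see e.g. qdf24xx below)
   turns on software prefetching by default from that optimization level
   onwards.  */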
737
738 static const cpu_prefetch_tune exynosm1_prefetch_tune =
739 {
740 0, /* num_slots */
741 -1, /* l1_cache_size */
742 64, /* l1_cache_line_size */
743 -1, /* l2_cache_size */
744 true, /* prefetch_dynamic_strides */
745 -1, /* minimum_stride */
746 -1 /* default_opt_level */
747 };
748
749 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
750 {
751 4, /* num_slots */
752 32, /* l1_cache_size */
753 64, /* l1_cache_line_size */
754 512, /* l2_cache_size */
755 false, /* prefetch_dynamic_strides */
756 2048, /* minimum_stride */
757 3 /* default_opt_level */
758 };
759
760 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
761 {
762 8, /* num_slots */
763 32, /* l1_cache_size */
764 128, /* l1_cache_line_size */
765 16*1024, /* l2_cache_size */
766 true, /* prefetch_dynamic_strides */
767 -1, /* minimum_stride */
768 3 /* default_opt_level */
769 };
770
771 static const cpu_prefetch_tune thunderx_prefetch_tune =
772 {
773 8, /* num_slots */
774 32, /* l1_cache_size */
775 128, /* l1_cache_line_size */
776 -1, /* l2_cache_size */
777 true, /* prefetch_dynamic_strides */
778 -1, /* minimum_stride */
779 -1 /* default_opt_level */
780 };
781
782 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
783 {
784 8, /* num_slots */
785 32, /* l1_cache_size */
786 64, /* l1_cache_line_size */
787 256, /* l2_cache_size */
788 true, /* prefetch_dynamic_strides */
789 -1, /* minimum_stride */
790 -1 /* default_opt_level */
791 };
792
793 static const cpu_prefetch_tune tsv110_prefetch_tune =
794 {
795 0, /* num_slots */
796 64, /* l1_cache_size */
797 64, /* l1_cache_line_size */
798 512, /* l2_cache_size */
799 true, /* prefetch_dynamic_strides */
800 -1, /* minimum_stride */
801 -1 /* default_opt_level */
802 };
803
804 static const cpu_prefetch_tune xgene1_prefetch_tune =
805 {
806 8, /* num_slots */
807 32, /* l1_cache_size */
808 64, /* l1_cache_line_size */
809 256, /* l2_cache_size */
810 true, /* prefetch_dynamic_strides */
811 -1, /* minimum_stride */
812 -1 /* default_opt_level */
813 };
814
815 static const struct tune_params generic_tunings =
816 {
817 &cortexa57_extra_costs,
818 &generic_addrcost_table,
819 &generic_regmove_cost,
820 &generic_vector_cost,
821 &generic_branch_cost,
822 &generic_approx_modes,
823 SVE_NOT_IMPLEMENTED, /* sve_width */
824 4, /* memmov_cost */
825 2, /* issue_rate */
826 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
827 "16:12", /* function_align. */
828 "4", /* jump_align. */
829 "8", /* loop_align. */
830 2, /* int_reassoc_width. */
831 4, /* fp_reassoc_width. */
832 1, /* vec_reassoc_width. */
833 2, /* min_div_recip_mul_sf. */
834 2, /* min_div_recip_mul_df. */
835 0, /* max_case_values. */
836 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
837 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
838 &generic_prefetch_tune
839 };
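/* Note on the alignment strings above: they use the same syntax as
   -falign-functions and friends, so "16:12" asks for 16-byte alignment
   but bounds the amount of padding that may be inserted to reach it (the
   value after the colon), keeping aggressive function alignment from
   bloating code with long runs of no-ops.  */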
840
841 static const struct tune_params cortexa35_tunings =
842 {
843 &cortexa53_extra_costs,
844 &generic_addrcost_table,
845 &cortexa53_regmove_cost,
846 &generic_vector_cost,
847 &generic_branch_cost,
848 &generic_approx_modes,
849 SVE_NOT_IMPLEMENTED, /* sve_width */
850 4, /* memmov_cost */
851 1, /* issue_rate */
852 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
853 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
854 "16", /* function_align. */
855 "4", /* jump_align. */
856 "8", /* loop_align. */
857 2, /* int_reassoc_width. */
858 4, /* fp_reassoc_width. */
859 1, /* vec_reassoc_width. */
860 2, /* min_div_recip_mul_sf. */
861 2, /* min_div_recip_mul_df. */
862 0, /* max_case_values. */
863 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
864 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
865 &generic_prefetch_tune
866 };
867
868 static const struct tune_params cortexa53_tunings =
869 {
870 &cortexa53_extra_costs,
871 &generic_addrcost_table,
872 &cortexa53_regmove_cost,
873 &generic_vector_cost,
874 &generic_branch_cost,
875 &generic_approx_modes,
876 SVE_NOT_IMPLEMENTED, /* sve_width */
877 4, /* memmov_cost */
878 2, /* issue_rate */
879 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
880 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
881 "16", /* function_align. */
882 "4", /* jump_align. */
883 "8", /* loop_align. */
884 2, /* int_reassoc_width. */
885 4, /* fp_reassoc_width. */
886 1, /* vec_reassoc_width. */
887 2, /* min_div_recip_mul_sf. */
888 2, /* min_div_recip_mul_df. */
889 0, /* max_case_values. */
890 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
891 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
892 &generic_prefetch_tune
893 };
894
895 static const struct tune_params cortexa57_tunings =
896 {
897 &cortexa57_extra_costs,
898 &generic_addrcost_table,
899 &cortexa57_regmove_cost,
900 &cortexa57_vector_cost,
901 &generic_branch_cost,
902 &generic_approx_modes,
903 SVE_NOT_IMPLEMENTED, /* sve_width */
904 4, /* memmov_cost */
905 3, /* issue_rate */
906 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
907 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
908 "16", /* function_align. */
909 "4", /* jump_align. */
910 "8", /* loop_align. */
911 2, /* int_reassoc_width. */
912 4, /* fp_reassoc_width. */
913 1, /* vec_reassoc_width. */
914 2, /* min_div_recip_mul_sf. */
915 2, /* min_div_recip_mul_df. */
916 0, /* max_case_values. */
917 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
918 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
919 &generic_prefetch_tune
920 };
921
922 static const struct tune_params cortexa72_tunings =
923 {
924 &cortexa57_extra_costs,
925 &generic_addrcost_table,
926 &cortexa57_regmove_cost,
927 &cortexa57_vector_cost,
928 &generic_branch_cost,
929 &generic_approx_modes,
930 SVE_NOT_IMPLEMENTED, /* sve_width */
931 4, /* memmov_cost */
932 3, /* issue_rate */
933 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
934 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
935 "16", /* function_align. */
936 "4", /* jump_align. */
937 "8", /* loop_align. */
938 2, /* int_reassoc_width. */
939 4, /* fp_reassoc_width. */
940 1, /* vec_reassoc_width. */
941 2, /* min_div_recip_mul_sf. */
942 2, /* min_div_recip_mul_df. */
943 0, /* max_case_values. */
944 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
945 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
946 &generic_prefetch_tune
947 };
948
949 static const struct tune_params cortexa73_tunings =
950 {
951 &cortexa57_extra_costs,
952 &generic_addrcost_table,
953 &cortexa57_regmove_cost,
954 &cortexa57_vector_cost,
955 &generic_branch_cost,
956 &generic_approx_modes,
957 SVE_NOT_IMPLEMENTED, /* sve_width */
958 4, /* memmov_cost. */
959 2, /* issue_rate. */
960 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
961 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
962 "16", /* function_align. */
963 "4", /* jump_align. */
964 "8", /* loop_align. */
965 2, /* int_reassoc_width. */
966 4, /* fp_reassoc_width. */
967 1, /* vec_reassoc_width. */
968 2, /* min_div_recip_mul_sf. */
969 2, /* min_div_recip_mul_df. */
970 0, /* max_case_values. */
971 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
972 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
973 &generic_prefetch_tune
974 };
975
976
977
978 static const struct tune_params exynosm1_tunings =
979 {
980 &exynosm1_extra_costs,
981 &exynosm1_addrcost_table,
982 &exynosm1_regmove_cost,
983 &exynosm1_vector_cost,
984 &generic_branch_cost,
985 &exynosm1_approx_modes,
986 SVE_NOT_IMPLEMENTED, /* sve_width */
987 4, /* memmov_cost */
988 3, /* issue_rate */
989 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
990 "4", /* function_align. */
991 "4", /* jump_align. */
992 "4", /* loop_align. */
993 2, /* int_reassoc_width. */
994 4, /* fp_reassoc_width. */
995 1, /* vec_reassoc_width. */
996 2, /* min_div_recip_mul_sf. */
997 2, /* min_div_recip_mul_df. */
998 48, /* max_case_values. */
999 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1000 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1001 &exynosm1_prefetch_tune
1002 };
1003
1004 static const struct tune_params thunderxt88_tunings =
1005 {
1006 &thunderx_extra_costs,
1007 &generic_addrcost_table,
1008 &thunderx_regmove_cost,
1009 &thunderx_vector_cost,
1010 &generic_branch_cost,
1011 &generic_approx_modes,
1012 SVE_NOT_IMPLEMENTED, /* sve_width */
1013 6, /* memmov_cost */
1014 2, /* issue_rate */
1015 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1016 "8", /* function_align. */
1017 "8", /* jump_align. */
1018 "8", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1026 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1027 &thunderxt88_prefetch_tune
1028 };
1029
1030 static const struct tune_params thunderx_tunings =
1031 {
1032 &thunderx_extra_costs,
1033 &generic_addrcost_table,
1034 &thunderx_regmove_cost,
1035 &thunderx_vector_cost,
1036 &generic_branch_cost,
1037 &generic_approx_modes,
1038 SVE_NOT_IMPLEMENTED, /* sve_width */
1039 6, /* memmov_cost */
1040 2, /* issue_rate */
1041 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1042 "8", /* function_align. */
1043 "8", /* jump_align. */
1044 "8", /* loop_align. */
1045 2, /* int_reassoc_width. */
1046 4, /* fp_reassoc_width. */
1047 1, /* vec_reassoc_width. */
1048 2, /* min_div_recip_mul_sf. */
1049 2, /* min_div_recip_mul_df. */
1050 0, /* max_case_values. */
1051 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1052 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1053 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1054 &thunderx_prefetch_tune
1055 };
1056
1057 static const struct tune_params tsv110_tunings =
1058 {
1059 &tsv110_extra_costs,
1060 &tsv110_addrcost_table,
1061 &tsv110_regmove_cost,
1062 &tsv110_vector_cost,
1063 &generic_branch_cost,
1064 &generic_approx_modes,
1065 SVE_NOT_IMPLEMENTED, /* sve_width */
1066 4, /* memmov_cost */
1067 4, /* issue_rate */
1068 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1069 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1070 "16", /* function_align. */
1071 "4", /* jump_align. */
1072 "8", /* loop_align. */
1073 2, /* int_reassoc_width. */
1074 4, /* fp_reassoc_width. */
1075 1, /* vec_reassoc_width. */
1076 2, /* min_div_recip_mul_sf. */
1077 2, /* min_div_recip_mul_df. */
1078 0, /* max_case_values. */
1079 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1080 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1081 &tsv110_prefetch_tune
1082 };
1083
1084 static const struct tune_params xgene1_tunings =
1085 {
1086 &xgene1_extra_costs,
1087 &xgene1_addrcost_table,
1088 &xgene1_regmove_cost,
1089 &xgene1_vector_cost,
1090 &generic_branch_cost,
1091 &xgene1_approx_modes,
1092 SVE_NOT_IMPLEMENTED, /* sve_width */
1093 6, /* memmov_cost */
1094 4, /* issue_rate */
1095 AARCH64_FUSE_NOTHING, /* fusible_ops */
1096 "16", /* function_align. */
1097 "16", /* jump_align. */
1098 "16", /* loop_align. */
1099 2, /* int_reassoc_width. */
1100 4, /* fp_reassoc_width. */
1101 1, /* vec_reassoc_width. */
1102 2, /* min_div_recip_mul_sf. */
1103 2, /* min_div_recip_mul_df. */
1104 17, /* max_case_values. */
1105 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1106 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1107 &xgene1_prefetch_tune
1108 };
1109
1110 static const struct tune_params emag_tunings =
1111 {
1112 &xgene1_extra_costs,
1113 &xgene1_addrcost_table,
1114 &xgene1_regmove_cost,
1115 &xgene1_vector_cost,
1116 &generic_branch_cost,
1117 &xgene1_approx_modes,
1118 SVE_NOT_IMPLEMENTED, /* sve_width */
1119 6, /* memmov_cost */
1120 4, /* issue_rate */
1121 AARCH64_FUSE_NOTHING, /* fusible_ops */
1122 "16", /* function_align. */
1123 "16", /* jump_align. */
1124 "16", /* loop_align. */
1125 2, /* int_reassoc_width. */
1126 4, /* fp_reassoc_width. */
1127 1, /* vec_reassoc_width. */
1128 2, /* min_div_recip_mul_sf. */
1129 2, /* min_div_recip_mul_df. */
1130 17, /* max_case_values. */
1131 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1132 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1133 &xgene1_prefetch_tune
1134 };
1135
1136 static const struct tune_params qdf24xx_tunings =
1137 {
1138 &qdf24xx_extra_costs,
1139 &qdf24xx_addrcost_table,
1140 &qdf24xx_regmove_cost,
1141 &qdf24xx_vector_cost,
1142 &generic_branch_cost,
1143 &generic_approx_modes,
1144 SVE_NOT_IMPLEMENTED, /* sve_width */
1145 4, /* memmov_cost */
1146 4, /* issue_rate */
1147 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1148 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1149 "16", /* function_align. */
1150 "8", /* jump_align. */
1151 "16", /* loop_align. */
1152 2, /* int_reassoc_width. */
1153 4, /* fp_reassoc_width. */
1154 1, /* vec_reassoc_width. */
1155 2, /* min_div_recip_mul_sf. */
1156 2, /* min_div_recip_mul_df. */
1157 0, /* max_case_values. */
1158 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1159 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1160 &qdf24xx_prefetch_tune
1161 };
1162
1163 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1164 for now. */
1165 static const struct tune_params saphira_tunings =
1166 {
1167 &generic_extra_costs,
1168 &generic_addrcost_table,
1169 &generic_regmove_cost,
1170 &generic_vector_cost,
1171 &generic_branch_cost,
1172 &generic_approx_modes,
1173 SVE_NOT_IMPLEMENTED, /* sve_width */
1174 4, /* memmov_cost */
1175 4, /* issue_rate */
1176 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1177 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1178 "16", /* function_align. */
1179 "8", /* jump_align. */
1180 "16", /* loop_align. */
1181 2, /* int_reassoc_width. */
1182 4, /* fp_reassoc_width. */
1183 1, /* vec_reassoc_width. */
1184 2, /* min_div_recip_mul_sf. */
1185 2, /* min_div_recip_mul_df. */
1186 0, /* max_case_values. */
1187 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1188 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1189 &generic_prefetch_tune
1190 };
1191
1192 static const struct tune_params thunderx2t99_tunings =
1193 {
1194 &thunderx2t99_extra_costs,
1195 &thunderx2t99_addrcost_table,
1196 &thunderx2t99_regmove_cost,
1197 &thunderx2t99_vector_cost,
1198 &generic_branch_cost,
1199 &generic_approx_modes,
1200 SVE_NOT_IMPLEMENTED, /* sve_width */
1201 4, /* memmov_cost. */
1202 4, /* issue_rate. */
1203 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1204 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1205 "16", /* function_align. */
1206 "8", /* jump_align. */
1207 "16", /* loop_align. */
1208 3, /* int_reassoc_width. */
1209 2, /* fp_reassoc_width. */
1210 2, /* vec_reassoc_width. */
1211 2, /* min_div_recip_mul_sf. */
1212 2, /* min_div_recip_mul_df. */
1213 0, /* max_case_values. */
1214 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1215 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1216 &thunderx2t99_prefetch_tune
1217 };
1218
1219 static const struct tune_params neoversen1_tunings =
1220 {
1221 &cortexa57_extra_costs,
1222 &generic_addrcost_table,
1223 &generic_regmove_cost,
1224 &cortexa57_vector_cost,
1225 &generic_branch_cost,
1226 &generic_approx_modes,
1227 SVE_NOT_IMPLEMENTED, /* sve_width */
1228 4, /* memmov_cost */
1229 3, /* issue_rate */
1230 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1231 "32:16", /* function_align. */
1232 "4", /* jump_align. */
1233 "32:16", /* loop_align. */
1234 2, /* int_reassoc_width. */
1235 4, /* fp_reassoc_width. */
1236 2, /* vec_reassoc_width. */
1237 2, /* min_div_recip_mul_sf. */
1238 2, /* min_div_recip_mul_df. */
1239 0, /* max_case_values. */
1240 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1241 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1242 &generic_prefetch_tune
1243 };
1244
1245 /* Support for fine-grained override of the tuning structures. */
1246 struct aarch64_tuning_override_function
1247 {
1248 const char* name;
1249 void (*parse_override)(const char*, struct tune_params*);
1250 };
1251
1252 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1253 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1254 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1255
1256 static const struct aarch64_tuning_override_function
1257 aarch64_tuning_override_functions[] =
1258 {
1259 { "fuse", aarch64_parse_fuse_string },
1260 { "tune", aarch64_parse_tune_string },
1261 { "sve_width", aarch64_parse_sve_width_string },
1262 { NULL, NULL }
1263 };
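/* For example, this table is what dispatches the clauses of
   -moverride=tune=...,fuse=...,sve_width=256: each comma-separated
   "name=value" clause is matched against the names above and handed to
   the corresponding parse_override callback, so sve_width=256 ends up
   overriding the sve_width field of the active tune_params.  */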
1264
1265 /* A processor implementing AArch64. */
1266 struct processor
1267 {
1268 const char *const name;
1269 enum aarch64_processor ident;
1270 enum aarch64_processor sched_core;
1271 enum aarch64_arch arch;
1272 unsigned architecture_version;
1273 const uint64_t flags;
1274 const struct tune_params *const tune;
1275 };
1276
1277 /* Architectures implementing AArch64. */
1278 static const struct processor all_architectures[] =
1279 {
1280 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1281 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1282 #include "aarch64-arches.def"
1283 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1284 };
1285
1286 /* Processor cores implementing AArch64. */
1287 static const struct processor all_cores[] =
1288 {
1289 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1290 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1291 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1292 FLAGS, &COSTS##_tunings},
1293 #include "aarch64-cores.def"
1294 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1295 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1296 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1297 };
1298
1299
1300 /* Target specification. These are populated by the -march, -mtune, -mcpu
1301 handling code or by target attributes. */
1302 static const struct processor *selected_arch;
1303 static const struct processor *selected_cpu;
1304 static const struct processor *selected_tune;
1305
1306 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1307
1308 /* The current tuning set. */
1309 struct tune_params aarch64_tune_params = generic_tunings;
1310
1311 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1312
1313 static tree
1314 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1315 int, bool *no_add_attrs)
1316 {
1317 /* Since we set fn_type_req to true, the caller should have checked
1318 this for us. */
1319 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1320 switch ((arm_pcs) fntype_abi (*node).id ())
1321 {
1322 case ARM_PCS_AAPCS64:
1323 case ARM_PCS_SIMD:
1324 return NULL_TREE;
1325
1326 case ARM_PCS_SVE:
1327 error ("the %qE attribute cannot be applied to an SVE function type",
1328 name);
1329 *no_add_attrs = true;
1330 return NULL_TREE;
1331
1332 case ARM_PCS_TLSDESC:
1333 case ARM_PCS_UNKNOWN:
1334 break;
1335 }
1336 gcc_unreachable ();
1337 }
1338
1339 /* Table of machine attributes. */
1340 static const struct attribute_spec aarch64_attribute_table[] =
1341 {
1342 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1343 affects_type_identity, handler, exclude } */
1344 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1345 handle_aarch64_vector_pcs_attribute, NULL },
1346 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1347 aarch64_sve::handle_arm_sve_vector_bits_attribute,
1348 NULL },
1349 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1350 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
1351 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1352 };
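/* Example usage of the user-visible attributes above (illustrative only;
   see the Arm C Language Extensions for the authoritative rules):

     // Requires -msve-vector-bits=256 so that the argument matches
     // __ARM_FEATURE_SVE_BITS.
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

     void f (float *, float *) __attribute__ ((aarch64_vector_pcs));

   "SVE type" and "SVE sizeless type" contain spaces so that users cannot
   write them; they are internal markers applied to the SVE built-in
   types.  */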
1353
1354 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1355
1356 /* An ISA extension in the co-processor and main instruction set space. */
1357 struct aarch64_option_extension
1358 {
1359 const char *const name;
1360 const unsigned long flags_on;
1361 const unsigned long flags_off;
1362 };
1363
1364 typedef enum aarch64_cond_code
1365 {
1366 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1367 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1368 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1369 }
1370 aarch64_cc;
1371
1372 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1373
1374 struct aarch64_branch_protect_type
1375 {
1376 /* The type's name that the user passes to the branch-protection option
1377 string. */
1378 const char* name;
1379 /* Function to handle the protection type and set global variables.
1380 First argument is the string token corresponding to this type and the
1381 second argument is the next token in the option string.
1382 Return values:
1383 * AARCH64_PARSE_OK: Handling was successful.
1384 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1385 caller should print an error.
1386 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints
1387 its own error. */
1388 enum aarch64_parse_opt_result (*handler)(char*, char*);
1389 /* A list of types that can follow this type in the option string. */
1390 const aarch64_branch_protect_type* subtypes;
1391 unsigned int num_subtypes;
1392 };
1393
1394 static enum aarch64_parse_opt_result
1395 aarch64_handle_no_branch_protection (char* str, char* rest)
1396 {
1397 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1398 aarch64_enable_bti = 0;
1399 if (rest)
1400 {
1401 error ("unexpected %<%s%> after %<%s%>", rest, str);
1402 return AARCH64_PARSE_INVALID_FEATURE;
1403 }
1404 return AARCH64_PARSE_OK;
1405 }
1406
1407 static enum aarch64_parse_opt_result
1408 aarch64_handle_standard_branch_protection (char* str, char* rest)
1409 {
1410 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1411 aarch64_ra_sign_key = AARCH64_KEY_A;
1412 aarch64_enable_bti = 1;
1413 if (rest)
1414 {
1415 error ("unexpected %<%s%> after %<%s%>", rest, str);
1416 return AARCH64_PARSE_INVALID_FEATURE;
1417 }
1418 return AARCH64_PARSE_OK;
1419 }
1420
1421 static enum aarch64_parse_opt_result
1422 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1423 char* rest ATTRIBUTE_UNUSED)
1424 {
1425 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1426 aarch64_ra_sign_key = AARCH64_KEY_A;
1427 return AARCH64_PARSE_OK;
1428 }
1429
1430 static enum aarch64_parse_opt_result
1431 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1432 char* rest ATTRIBUTE_UNUSED)
1433 {
1434 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1435 return AARCH64_PARSE_OK;
1436 }
1437
1438 static enum aarch64_parse_opt_result
1439 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1440 char* rest ATTRIBUTE_UNUSED)
1441 {
1442 aarch64_ra_sign_key = AARCH64_KEY_B;
1443 return AARCH64_PARSE_OK;
1444 }
1445
1446 static enum aarch64_parse_opt_result
1447 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1448 char* rest ATTRIBUTE_UNUSED)
1449 {
1450 aarch64_enable_bti = 1;
1451 return AARCH64_PARSE_OK;
1452 }
1453
1454 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1455 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1456 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1457 { NULL, NULL, NULL, 0 }
1458 };
1459
1460 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1461 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1462 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1463 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1464 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1465 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1466 { NULL, NULL, NULL, 0 }
1467 };
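/* Putting the tables above together: -mbranch-protection=standard enables
   BTI and A-key return-address signing of non-leaf functions, while a
   string such as -mbranch-protection=pac-ret+leaf+b-key starts from
   "pac-ret" and then applies its "leaf" and "b-key" subtypes to sign
   leaf functions as well and to use the B key.  */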
1468
1469 /* The condition codes of the processor, and the inverse function. */
1470 static const char * const aarch64_condition_codes[] =
1471 {
1472 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1473 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1474 };
1475
1476 /* The preferred condition codes for SVE conditions. */
1477 static const char *const aarch64_sve_condition_codes[] =
1478 {
1479 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1480 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1481 };
1482
1483 /* Return the assembly token for svpattern value PATTERN. */
1484
1485 static const char *
1486 svpattern_token (enum aarch64_svpattern pattern)
1487 {
1488 switch (pattern)
1489 {
1490 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1491 AARCH64_FOR_SVPATTERN (CASE)
1492 #undef CASE
1493 case AARCH64_NUM_SVPATTERNS:
1494 break;
1495 }
1496 gcc_unreachable ();
1497 }
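/* For example, svpattern_token (AARCH64_SV_ALL) returns "all" and
   svpattern_token (AARCH64_SV_VL4) returns "vl4", as used when printing
   predicate constants such as "ptrue p0.b, vl4".  */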
1498
1499 /* Return the location of a piece that is known to be passed or returned
1500 in registers. FIRST_ZR is the first unused vector argument register
1501 and FIRST_PR is the first unused predicate argument register. */
1502
1503 rtx
1504 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1505 unsigned int first_pr) const
1506 {
1507 gcc_assert (VECTOR_MODE_P (mode)
1508 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1509 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1510
1511 if (num_zr > 0 && num_pr == 0)
1512 return gen_rtx_REG (mode, first_zr);
1513
1514 if (num_zr == 0 && num_pr == 1)
1515 return gen_rtx_REG (mode, first_pr);
1516
1517 gcc_unreachable ();
1518 }
1519
1520 /* Return the total number of vector registers required by the PST. */
1521
1522 unsigned int
1523 pure_scalable_type_info::num_zr () const
1524 {
1525 unsigned int res = 0;
1526 for (unsigned int i = 0; i < pieces.length (); ++i)
1527 res += pieces[i].num_zr;
1528 return res;
1529 }
1530
1531 /* Return the total number of predicate registers required by the PST. */
1532
1533 unsigned int
1534 pure_scalable_type_info::num_pr () const
1535 {
1536 unsigned int res = 0;
1537 for (unsigned int i = 0; i < pieces.length (); ++i)
1538 res += pieces[i].num_pr;
1539 return res;
1540 }
1541
1542 /* Return the location of a PST that is known to be passed or returned
1543 in registers. FIRST_ZR is the first unused vector argument register
1544 and FIRST_PR is the first unused predicate argument register. */
1545
1546 rtx
1547 pure_scalable_type_info::get_rtx (machine_mode mode,
1548 unsigned int first_zr,
1549 unsigned int first_pr) const
1550 {
1551 /* Try to return a single REG if possible. This leads to better
1552 code generation; it isn't required for correctness. */
1553 if (mode == pieces[0].mode)
1554 {
1555 gcc_assert (pieces.length () == 1);
1556 return pieces[0].get_rtx (first_zr, first_pr);
1557 }
1558
1559 /* Build up a PARALLEL that contains the individual pieces. */
1560 rtvec rtxes = rtvec_alloc (pieces.length ());
1561 for (unsigned int i = 0; i < pieces.length (); ++i)
1562 {
1563 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1564 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1565 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1566 first_zr += pieces[i].num_zr;
1567 first_pr += pieces[i].num_pr;
1568 }
1569 return gen_rtx_PARALLEL (mode, rtxes);
1570 }
1571
1572 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1573 in the AAPCS64. */
1574
1575 pure_scalable_type_info::analysis_result
1576 pure_scalable_type_info::analyze (const_tree type)
1577 {
1578 /* Prevent accidental reuse. */
1579 gcc_assert (pieces.is_empty ());
1580
1581 /* No code will be generated for erroneous types, so we won't establish
1582 an ABI mapping. */
1583 if (type == error_mark_node)
1584 return NO_ABI_IDENTITY;
1585
1586 /* Zero-sized types disappear in the language->ABI mapping. */
1587 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1588 return NO_ABI_IDENTITY;
1589
1590 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1591 piece p = {};
1592 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1593 {
1594 machine_mode mode = TYPE_MODE_RAW (type);
1595 gcc_assert (VECTOR_MODE_P (mode)
1596 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1597
1598 p.mode = p.orig_mode = mode;
1599 add_piece (p);
1600 return IS_PST;
1601 }
1602
1603 /* Check for user-defined PSTs. */
1604 if (TREE_CODE (type) == ARRAY_TYPE)
1605 return analyze_array (type);
1606 if (TREE_CODE (type) == RECORD_TYPE)
1607 return analyze_record (type);
1608
1609 return ISNT_PST;
1610 }
1611
1612 /* Analyze a type that is known not to be passed or returned in memory.
1613 Return true if it has an ABI identity and is a Pure Scalable Type. */
1614
1615 bool
1616 pure_scalable_type_info::analyze_registers (const_tree type)
1617 {
1618 analysis_result result = analyze (type);
1619 gcc_assert (result != DOESNT_MATTER);
1620 return result == IS_PST;
1621 }
1622
1623 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1624
1625 pure_scalable_type_info::analysis_result
1626 pure_scalable_type_info::analyze_array (const_tree type)
1627 {
1628 /* Analyze the element type. */
1629 pure_scalable_type_info element_info;
1630 analysis_result result = element_info.analyze (TREE_TYPE (type));
1631 if (result != IS_PST)
1632 return result;
1633
1634 /* An array of unknown, flexible or variable length will be passed and
1635 returned by reference whatever we do. */
1636 tree nelts_minus_one = array_type_nelts (type);
1637 if (!tree_fits_uhwi_p (nelts_minus_one))
1638 return DOESNT_MATTER;
1639
1640 /* Likewise if the array is constant-sized but too big to be interesting.
1641 The double checks against MAX_PIECES are to protect against overflow. */
1642 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1643 if (count > MAX_PIECES)
1644 return DOESNT_MATTER;
1645 count += 1;
1646 if (count * element_info.pieces.length () > MAX_PIECES)
1647 return DOESNT_MATTER;
1648
1649 /* The above checks should have weeded out elements of unknown size. */
1650 poly_uint64 element_bytes;
1651 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1652 gcc_unreachable ();
1653
1654 /* Build up the list of individual vectors and predicates. */
1655 gcc_assert (!element_info.pieces.is_empty ());
1656 for (unsigned int i = 0; i < count; ++i)
1657 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1658 {
1659 piece p = element_info.pieces[j];
1660 p.offset += i * element_bytes;
1661 add_piece (p);
1662 }
1663 return IS_PST;
1664 }
1665
1666 /* Subroutine of analyze for handling RECORD_TYPEs. */
1667
1668 pure_scalable_type_info::analysis_result
1669 pure_scalable_type_info::analyze_record (const_tree type)
1670 {
1671 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1672 {
1673 if (TREE_CODE (field) != FIELD_DECL)
1674 continue;
1675
1676 /* Zero-sized fields disappear in the language->ABI mapping. */
1677 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1678 continue;
1679
1680 /* All fields with an ABI identity must be PSTs for the record as
1681 a whole to be a PST. If any individual field is too big to be
1682 interesting then the record is too. */
1683 pure_scalable_type_info field_info;
1684 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1685 if (subresult == NO_ABI_IDENTITY)
1686 continue;
1687 if (subresult != IS_PST)
1688 return subresult;
1689
1690 /* Since all previous fields are PSTs, we ought to be able to track
1691 the field offset using poly_ints. */
1692 tree bitpos = bit_position (field);
1693 gcc_assert (poly_int_tree_p (bitpos));
1694
1695 /* For the same reason, it shouldn't be possible to create a PST field
1696 whose offset isn't byte-aligned. */
1697 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1698 BITS_PER_UNIT);
1699
1700 /* Punt if the record is too big to be interesting. */
1701 poly_uint64 bytepos;
1702 if (!wide_bytepos.to_uhwi (&bytepos)
1703 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1704 return DOESNT_MATTER;
1705
1706 /* Add the individual vectors and predicates in the field to the
1707 record's list. */
1708 gcc_assert (!field_info.pieces.is_empty ());
1709 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1710 {
1711 piece p = field_info.pieces[i];
1712 p.offset += bytepos;
1713 add_piece (p);
1714 }
1715 }
1716 /* Empty structures disappear in the language->ABI mapping. */
1717 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1718 }
1719
1720 /* Add P to the list of pieces in the type. */
1721
1722 void
1723 pure_scalable_type_info::add_piece (const piece &p)
1724 {
1725 /* Try to fold the new piece into the previous one to form a
1726 single-mode PST. For example, if we see three consecutive vectors
1727 of the same mode, we can represent them using the corresponding
1728 3-tuple mode.
1729
1730 This is purely an optimization. */
1731 if (!pieces.is_empty ())
1732 {
1733 piece &prev = pieces.last ();
1734 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1735 unsigned int nelems1, nelems2;
1736 if (prev.orig_mode == p.orig_mode
1737 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1738 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1739 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1740 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1741 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1742 && targetm.array_mode (p.orig_mode,
1743 nelems1 + nelems2).exists (&prev.mode))
1744 {
1745 prev.num_zr += p.num_zr;
1746 prev.num_pr += p.num_pr;
1747 return;
1748 }
1749 }
1750 pieces.quick_push (p);
1751 }
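/* Concrete illustration of the folding above: two consecutive pieces of
   mode VNx4SF at adjacent offsets are merged into a single piece whose
   mode is the two-vector tuple mode VNx8SF, so later code can move the
   pair as one register group instead of piecewise.  */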
1752
1753 /* Return true if at least one possible value of type TYPE includes at
1754 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1755
1756 This is a relatively expensive test for some types, so it should
1757 generally be made as late as possible. */
1758
1759 static bool
1760 aarch64_some_values_include_pst_objects_p (const_tree type)
1761 {
1762 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1763 return false;
1764
1765 if (aarch64_sve::builtin_type_p (type))
1766 return true;
1767
1768 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1769 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1770
1771 if (RECORD_OR_UNION_TYPE_P (type))
1772 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1773 if (TREE_CODE (field) == FIELD_DECL
1774 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1775 return true;
1776
1777 return false;
1778 }
1779
1780 /* Return the descriptor of the SIMD ABI. */
1781
1782 static const predefined_function_abi &
1783 aarch64_simd_abi (void)
1784 {
1785 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1786 if (!simd_abi.initialized_p ())
1787 {
1788 HARD_REG_SET full_reg_clobbers
1789 = default_function_abi.full_reg_clobbers ();
1790 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1791 if (FP_SIMD_SAVED_REGNUM_P (regno))
1792 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1793 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1794 }
1795 return simd_abi;
1796 }
1797
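/* For illustration, a sketch of how the SIMD ABI above is selected at
   the source level (the function name is an example only):

     void __attribute__ ((aarch64_vector_pcs)) vec_helper (void);

   Callers of such a function may assume that q8-q23 are preserved in
   full, rather than only the low 64 bits of v8-v15 as under the base
   PCS.  */
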
1798 /* Return the descriptor of the SVE PCS. */
1799
1800 static const predefined_function_abi &
1801 aarch64_sve_abi (void)
1802 {
1803 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1804 if (!sve_abi.initialized_p ())
1805 {
1806 HARD_REG_SET full_reg_clobbers
1807 = default_function_abi.full_reg_clobbers ();
1808 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1809 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1810 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1811 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1812 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1813 }
1814 return sve_abi;
1815 }
1816
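/* For illustration, the SVE PCS above is selected implicitly (see
   aarch64_fntype_abi below) for functions that take or return SVE
   types, e.g. (a sketch; the names are examples only):

     #include <arm_sve.h>
     svfloat32_t scale_active (svbool_t pg, svfloat32_t x, float scale);

   Callers of such a function may assume that z8-z23 and p4-p15 are
   preserved, matching the registers removed from full_reg_clobbers
   above.  */
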
1817 /* Generate code to enable conditional branches in functions over 1 MiB. */
1818 const char *
1819 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1820 const char * branch_format)
1821 {
1822 rtx_code_label * tmp_label = gen_label_rtx ();
1823 char label_buf[256];
1824 char buffer[128];
1825 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1826 CODE_LABEL_NUMBER (tmp_label));
1827 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1828 rtx dest_label = operands[pos_label];
1829 operands[pos_label] = tmp_label;
1830
1831 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1832 output_asm_insn (buffer, operands);
1833
1834 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1835 operands[pos_label] = dest_label;
1836 output_asm_insn (buffer, operands);
1837 return "";
1838 }
1839
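/* For illustration, a sketch of the output of the function above when a
   conditional branch to .Lfar lies beyond the +/-1 MiB range of B.cond
   (the label names are examples only, and the caller is assumed to pass
   the inverted condition in BRANCH_FORMAT):

     b.ge  .Ltmp
     b     .Lfar
   .Ltmp:

   The unconditional B has a +/-128 MiB range, which is what makes the
   far branch possible.  */
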
1840 void
1841 aarch64_err_no_fpadvsimd (machine_mode mode)
1842 {
1843 if (TARGET_GENERAL_REGS_ONLY)
1844 if (FLOAT_MODE_P (mode))
1845 error ("%qs is incompatible with the use of floating-point types",
1846 "-mgeneral-regs-only");
1847 else
1848 error ("%qs is incompatible with the use of vector types",
1849 "-mgeneral-regs-only");
1850 else
1851 if (FLOAT_MODE_P (mode))
1852 error ("%qs feature modifier is incompatible with the use of"
1853 " floating-point types", "+nofp");
1854 else
1855 error ("%qs feature modifier is incompatible with the use of"
1856 " vector types", "+nofp");
1857 }
1858
1859 /* Report when we try to do something that requires SVE when SVE is disabled.
1860 This is an error of last resort and isn't very high-quality. It usually
1861 involves attempts to measure the vector length in some way. */
1862 static void
1863 aarch64_report_sve_required (void)
1864 {
1865 static bool reported_p = false;
1866
1867 /* Avoid reporting a slew of messages for a single oversight. */
1868 if (reported_p)
1869 return;
1870
1871 error ("this operation requires the SVE ISA extension");
1872 inform (input_location, "you can enable SVE using the command-line"
1873 " option %<-march%>, or by using the %<target%>"
1874 " attribute or pragma");
1875 reported_p = true;
1876 }
1877
1878 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1879 registers. */
1880 inline bool
1881 pr_or_ffr_regnum_p (unsigned int regno)
1882 {
1883 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1884 }
1885
1886 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1887 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1888 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1889 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1890 and GENERAL_REGS is lower than the memory cost (in this case the best class
1891 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1892 cost results in bad allocations with many redundant int<->FP moves which
1893 are expensive on various cores.
1894 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1895 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1896 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1897 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1898 The result of this is that it is no longer inefficient to have a higher
1899 memory move cost than the register move cost.
1900 */
1901
1902 static reg_class_t
1903 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1904 reg_class_t best_class)
1905 {
1906 machine_mode mode;
1907
1908 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1909 || !reg_class_subset_p (FP_REGS, allocno_class))
1910 return allocno_class;
1911
1912 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1913 || !reg_class_subset_p (FP_REGS, best_class))
1914 return best_class;
1915
1916 mode = PSEUDO_REGNO_MODE (regno);
1917 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1918 }
1919
1920 static unsigned int
1921 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1922 {
1923 if (GET_MODE_UNIT_SIZE (mode) == 4)
1924 return aarch64_tune_params.min_div_recip_mul_sf;
1925 return aarch64_tune_params.min_div_recip_mul_df;
1926 }
1927
1928 /* Return the reassociation width of treeop OPC with mode MODE. */
1929 static int
1930 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1931 {
1932 if (VECTOR_MODE_P (mode))
1933 return aarch64_tune_params.vec_reassoc_width;
1934 if (INTEGRAL_MODE_P (mode))
1935 return aarch64_tune_params.int_reassoc_width;
1936 /* Avoid reassociating floating point addition so we emit more FMAs. */
1937 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1938 return aarch64_tune_params.fp_reassoc_width;
1939 return 1;
1940 }
1941
1942 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1943 unsigned
1944 aarch64_dbx_register_number (unsigned regno)
1945 {
1946 if (GP_REGNUM_P (regno))
1947 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1948 else if (regno == SP_REGNUM)
1949 return AARCH64_DWARF_SP;
1950 else if (FP_REGNUM_P (regno))
1951 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1952 else if (PR_REGNUM_P (regno))
1953 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1954 else if (regno == VG_REGNUM)
1955 return AARCH64_DWARF_VG;
1956
1957 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1958 equivalent DWARF register. */
1959 return DWARF_FRAME_REGISTERS;
1960 }
1961
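/* For example, following the mapping above, x5 maps to
   AARCH64_DWARF_R0 + 5, v3 to AARCH64_DWARF_V0 + 3 and p2 to
   AARCH64_DWARF_P0 + 2, while registers with no DWARF equivalent (such
   as the condition flags) map to DWARF_FRAME_REGISTERS.  */
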
1962 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1963 integer, otherwise return X unmodified. */
1964 static rtx
1965 aarch64_bit_representation (rtx x)
1966 {
1967 if (CONST_DOUBLE_P (x))
1968 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1969 return x;
1970 }
1971
1972 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1973 static bool
1974 aarch64_advsimd_struct_mode_p (machine_mode mode)
1975 {
1976 return (TARGET_SIMD
1977 && (mode == OImode || mode == CImode || mode == XImode));
1978 }
1979
1980 /* Return true if MODE is an SVE predicate mode. */
1981 static bool
1982 aarch64_sve_pred_mode_p (machine_mode mode)
1983 {
1984 return (TARGET_SVE
1985 && (mode == VNx16BImode
1986 || mode == VNx8BImode
1987 || mode == VNx4BImode
1988 || mode == VNx2BImode));
1989 }
1990
1991 /* Three mutually-exclusive flags describing a vector or predicate type. */
1992 const unsigned int VEC_ADVSIMD = 1;
1993 const unsigned int VEC_SVE_DATA = 2;
1994 const unsigned int VEC_SVE_PRED = 4;
1995 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1996 a structure of 2, 3 or 4 vectors. */
1997 const unsigned int VEC_STRUCT = 8;
1998 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1999 vector has fewer significant bytes than a full SVE vector. */
2000 const unsigned int VEC_PARTIAL = 16;
2001 /* Useful combinations of the above. */
2002 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2003 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2004
2005 /* Return a set of flags describing the vector properties of mode MODE.
2006 Ignore modes that are not supported by the current target. */
2007 static unsigned int
2008 aarch64_classify_vector_mode (machine_mode mode)
2009 {
2010 if (aarch64_advsimd_struct_mode_p (mode))
2011 return VEC_ADVSIMD | VEC_STRUCT;
2012
2013 if (aarch64_sve_pred_mode_p (mode))
2014 return VEC_SVE_PRED;
2015
2016 /* Make the decision based on the mode's enum value rather than its
2017 properties, so that we keep the correct classification regardless
2018 of -msve-vector-bits. */
2019 switch (mode)
2020 {
2021 /* Partial SVE QI vectors. */
2022 case E_VNx2QImode:
2023 case E_VNx4QImode:
2024 case E_VNx8QImode:
2025 /* Partial SVE HI vectors. */
2026 case E_VNx2HImode:
2027 case E_VNx4HImode:
2028 /* Partial SVE SI vector. */
2029 case E_VNx2SImode:
2030 /* Partial SVE HF vectors. */
2031 case E_VNx2HFmode:
2032 case E_VNx4HFmode:
2033 /* Partial SVE SF vector. */
2034 case E_VNx2SFmode:
2035 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2036
2037 case E_VNx16QImode:
2038 case E_VNx8HImode:
2039 case E_VNx4SImode:
2040 case E_VNx2DImode:
2041 case E_VNx8BFmode:
2042 case E_VNx8HFmode:
2043 case E_VNx4SFmode:
2044 case E_VNx2DFmode:
2045 return TARGET_SVE ? VEC_SVE_DATA : 0;
2046
2047 /* x2 SVE vectors. */
2048 case E_VNx32QImode:
2049 case E_VNx16HImode:
2050 case E_VNx8SImode:
2051 case E_VNx4DImode:
2052 case E_VNx16BFmode:
2053 case E_VNx16HFmode:
2054 case E_VNx8SFmode:
2055 case E_VNx4DFmode:
2056 /* x3 SVE vectors. */
2057 case E_VNx48QImode:
2058 case E_VNx24HImode:
2059 case E_VNx12SImode:
2060 case E_VNx6DImode:
2061 case E_VNx24BFmode:
2062 case E_VNx24HFmode:
2063 case E_VNx12SFmode:
2064 case E_VNx6DFmode:
2065 /* x4 SVE vectors. */
2066 case E_VNx64QImode:
2067 case E_VNx32HImode:
2068 case E_VNx16SImode:
2069 case E_VNx8DImode:
2070 case E_VNx32BFmode:
2071 case E_VNx32HFmode:
2072 case E_VNx16SFmode:
2073 case E_VNx8DFmode:
2074 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2075
2076 /* 64-bit Advanced SIMD vectors. */
2077 case E_V8QImode:
2078 case E_V4HImode:
2079 case E_V2SImode:
2080 /* ...E_V1DImode doesn't exist. */
2081 case E_V4HFmode:
2082 case E_V4BFmode:
2083 case E_V2SFmode:
2084 case E_V1DFmode:
2085 /* 128-bit Advanced SIMD vectors. */
2086 case E_V16QImode:
2087 case E_V8HImode:
2088 case E_V4SImode:
2089 case E_V2DImode:
2090 case E_V8HFmode:
2091 case E_V8BFmode:
2092 case E_V4SFmode:
2093 case E_V2DFmode:
2094 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2095
2096 default:
2097 return 0;
2098 }
2099 }
2100
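/* For example, following the switch above: V4SImode is classified as
   VEC_ADVSIMD, VNx4SImode as VEC_SVE_DATA, VNx2SImode as
   VEC_SVE_DATA | VEC_PARTIAL (32-bit elements in 64-bit containers),
   VNx8SImode as VEC_SVE_DATA | VEC_STRUCT (a 2-vector tuple) and
   VNx4BImode as VEC_SVE_PRED, in each case provided that the
   corresponding target feature is enabled.  */
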
2101 /* Return true if MODE is any of the data vector modes, including
2102 structure modes. */
2103 static bool
2104 aarch64_vector_data_mode_p (machine_mode mode)
2105 {
2106 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2107 }
2108
2109 /* Return true if MODE is any form of SVE mode, including predicates,
2110 vectors and structures. */
2111 bool
2112 aarch64_sve_mode_p (machine_mode mode)
2113 {
2114 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2115 }
2116
2117 /* Return true if MODE is an SVE data vector mode; either a single vector
2118 or a structure of vectors. */
2119 static bool
2120 aarch64_sve_data_mode_p (machine_mode mode)
2121 {
2122 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2123 }
2124
2125 /* Return the number of defined bytes in one constituent vector of
2126 SVE mode MODE, which has vector flags VEC_FLAGS. */
2127 static poly_int64
2128 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2129 {
2130 if (vec_flags & VEC_PARTIAL)
2131 /* A single partial vector. */
2132 return GET_MODE_SIZE (mode);
2133
2134 if (vec_flags & VEC_SVE_DATA)
2135 /* A single vector or a tuple. */
2136 return BYTES_PER_SVE_VECTOR;
2137
2138 /* A single predicate. */
2139 gcc_assert (vec_flags & VEC_SVE_PRED);
2140 return BYTES_PER_SVE_PRED;
2141 }
2142
2143 /* Implement target hook TARGET_ARRAY_MODE. */
2144 static opt_machine_mode
2145 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2146 {
2147 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2148 && IN_RANGE (nelems, 2, 4))
2149 return mode_for_vector (GET_MODE_INNER (mode),
2150 GET_MODE_NUNITS (mode) * nelems);
2151
2152 return opt_machine_mode ();
2153 }
2154
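/* For example, aarch64_array_mode (VNx4SImode, 3) yields VNx12SImode,
   which is also the mode one would expect for ACLE 3-vector tuples
   such as svint32x3_t.  */
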
2155 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2156 static bool
2157 aarch64_array_mode_supported_p (machine_mode mode,
2158 unsigned HOST_WIDE_INT nelems)
2159 {
2160 if (TARGET_SIMD
2161 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2162 || AARCH64_VALID_SIMD_DREG_MODE (mode))
2163 && (nelems >= 2 && nelems <= 4))
2164 return true;
2165
2166 return false;
2167 }
2168
2169 /* MODE is some form of SVE vector mode. For data modes, return the number
2170 of vector register bits that each element of MODE occupies, such as 64
2171 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2172 in a 64-bit container). For predicate modes, return the number of
2173 data bits controlled by each significant predicate bit. */
2174
2175 static unsigned int
2176 aarch64_sve_container_bits (machine_mode mode)
2177 {
2178 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2179 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2180 ? BITS_PER_SVE_VECTOR
2181 : GET_MODE_BITSIZE (mode));
2182 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2183 }
2184
2185 /* Return the SVE predicate mode to use for elements that have
2186 ELEM_NBYTES bytes, if such a mode exists. */
2187
2188 opt_machine_mode
2189 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2190 {
2191 if (TARGET_SVE)
2192 {
2193 if (elem_nbytes == 1)
2194 return VNx16BImode;
2195 if (elem_nbytes == 2)
2196 return VNx8BImode;
2197 if (elem_nbytes == 4)
2198 return VNx4BImode;
2199 if (elem_nbytes == 8)
2200 return VNx2BImode;
2201 }
2202 return opt_machine_mode ();
2203 }
2204
2205 /* Return the SVE predicate mode that should be used to control
2206 SVE mode MODE. */
2207
2208 machine_mode
2209 aarch64_sve_pred_mode (machine_mode mode)
2210 {
2211 unsigned int bits = aarch64_sve_container_bits (mode);
2212 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2213 }
2214
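/* For example, VNx4SImode and VNx4SFmode are controlled by VNx4BImode,
   while the partial vector mode VNx2SImode (32-bit elements stored in
   64-bit containers) is controlled by VNx2BImode, since each
   significant predicate bit controls a 64-bit container.  */
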
2215 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2216
2217 static opt_machine_mode
2218 aarch64_get_mask_mode (machine_mode mode)
2219 {
2220 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2221 if (vec_flags & VEC_SVE_DATA)
2222 return aarch64_sve_pred_mode (mode);
2223
2224 return default_get_mask_mode (mode);
2225 }
2226
2227 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2228
2229 opt_machine_mode
2230 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2231 {
2232 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2233 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2234 machine_mode mode;
2235 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2236 if (inner_mode == GET_MODE_INNER (mode)
2237 && known_eq (nunits, GET_MODE_NUNITS (mode))
2238 && aarch64_sve_data_mode_p (mode))
2239 return mode;
2240 return opt_machine_mode ();
2241 }
2242
2243 /* Return the integer element mode associated with SVE mode MODE. */
2244
2245 static scalar_int_mode
2246 aarch64_sve_element_int_mode (machine_mode mode)
2247 {
2248 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2249 ? BITS_PER_SVE_VECTOR
2250 : GET_MODE_BITSIZE (mode));
2251 unsigned int elt_bits = vector_element_size (vector_bits,
2252 GET_MODE_NUNITS (mode));
2253 return int_mode_for_size (elt_bits, 0).require ();
2254 }
2255
2256 /* Return an integer element mode that contains exactly
2257 aarch64_sve_container_bits (MODE) bits. This is wider than
2258 aarch64_sve_element_int_mode if MODE is a partial vector,
2259 otherwise it's the same. */
2260
2261 static scalar_int_mode
2262 aarch64_sve_container_int_mode (machine_mode mode)
2263 {
2264 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2265 }
2266
2267 /* Return the integer vector mode associated with SVE mode MODE.
2268 Unlike related_int_vector_mode, this can handle the case in which
2269 MODE is a predicate (and thus has a different total size). */
2270
2271 machine_mode
2272 aarch64_sve_int_mode (machine_mode mode)
2273 {
2274 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2275 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2276 }
2277
2278 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2279
2280 static opt_machine_mode
2281 aarch64_vectorize_related_mode (machine_mode vector_mode,
2282 scalar_mode element_mode,
2283 poly_uint64 nunits)
2284 {
2285 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2286
2287 /* If we're operating on SVE vectors, try to return an SVE mode. */
2288 poly_uint64 sve_nunits;
2289 if ((vec_flags & VEC_SVE_DATA)
2290 && multiple_p (BYTES_PER_SVE_VECTOR,
2291 GET_MODE_SIZE (element_mode), &sve_nunits))
2292 {
2293 machine_mode sve_mode;
2294 if (maybe_ne (nunits, 0U))
2295 {
2296 /* Try to find a full or partial SVE mode with exactly
2297 NUNITS units. */
2298 if (multiple_p (sve_nunits, nunits)
2299 && aarch64_sve_data_mode (element_mode,
2300 nunits).exists (&sve_mode))
2301 return sve_mode;
2302 }
2303 else
2304 {
2305 /* Take the preferred number of units from the number of bytes
2306 that fit in VECTOR_MODE. We always start by "autodetecting"
2307 a full vector mode with preferred_simd_mode, so vectors
2308 chosen here will also be full vector modes. Then
2309 autovectorize_vector_modes tries smaller starting modes
2310 and thus smaller preferred numbers of units. */
2311 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2312 if (aarch64_sve_data_mode (element_mode,
2313 sve_nunits).exists (&sve_mode))
2314 return sve_mode;
2315 }
2316 }
2317
2318 /* Prefer to use one 128-bit vector instead of two 64-bit vectors. */
2319 if ((vec_flags & VEC_ADVSIMD)
2320 && known_eq (nunits, 0U)
2321 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2322 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2323 * GET_MODE_NUNITS (vector_mode), 128U))
2324 {
2325 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2326 if (VECTOR_MODE_P (res))
2327 return res;
2328 }
2329
2330 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2331 }
2332
2333 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2334 prefer to use the first arithmetic operand as the else value if
2335 the else value doesn't matter, since that exactly matches the SVE
2336 destructive merging form. For ternary operations we could either
2337 pick the first operand and use FMAD-like instructions or the last
2338 operand and use FMLA-like instructions; the latter seems more
2339 natural. */
2340
2341 static tree
2342 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2343 {
2344 return nops == 3 ? ops[2] : ops[0];
2345 }
2346
2347 /* Implement TARGET_HARD_REGNO_NREGS. */
2348
2349 static unsigned int
2350 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2351 {
2352 /* ??? Logically we should only need to provide a value when
2353 HARD_REGNO_MODE_OK says that the combination is valid,
2354 but at the moment we need to handle all modes. Just ignore
2355 any runtime parts for registers that can't store them. */
2356 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2357 switch (aarch64_regno_regclass (regno))
2358 {
2359 case FP_REGS:
2360 case FP_LO_REGS:
2361 case FP_LO8_REGS:
2362 {
2363 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2364 if (vec_flags & VEC_SVE_DATA)
2365 return exact_div (GET_MODE_SIZE (mode),
2366 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2367 return CEIL (lowest_size, UNITS_PER_VREG);
2368 }
2369 case PR_REGS:
2370 case PR_LO_REGS:
2371 case PR_HI_REGS:
2372 case FFR_REGS:
2373 case PR_AND_FFR_REGS:
2374 return 1;
2375 default:
2376 return CEIL (lowest_size, UNITS_PER_WORD);
2377 }
2378 gcc_unreachable ();
2379 }
2380
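/* For example, with the definition above: TImode occupies two general
   registers, a single SVE vector such as VNx4SImode occupies one FP
   register regardless of -msve-vector-bits, and the 2-vector tuple
   mode VNx8SImode occupies two FP registers.  */
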
2381 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2382
2383 static bool
2384 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2385 {
2386 if (GET_MODE_CLASS (mode) == MODE_CC)
2387 return regno == CC_REGNUM;
2388
2389 if (regno == VG_REGNUM)
2390 /* This must have the same size as _Unwind_Word. */
2391 return mode == DImode;
2392
2393 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2394 if (vec_flags & VEC_SVE_PRED)
2395 return pr_or_ffr_regnum_p (regno);
2396
2397 if (pr_or_ffr_regnum_p (regno))
2398 return false;
2399
2400 if (regno == SP_REGNUM)
2401 /* The purpose of comparing with ptr_mode is to support the
2402 global register variable associated with the stack pointer
2403 register via the syntax of asm ("wsp") in ILP32. */
2404 return mode == Pmode || mode == ptr_mode;
2405
2406 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2407 return mode == Pmode;
2408
2409 if (GP_REGNUM_P (regno))
2410 {
2411 if (vec_flags & VEC_ANY_SVE)
2412 return false;
2413 if (known_le (GET_MODE_SIZE (mode), 8))
2414 return true;
2415 if (known_le (GET_MODE_SIZE (mode), 16))
2416 return (regno & 1) == 0;
2417 }
2418 else if (FP_REGNUM_P (regno))
2419 {
2420 if (vec_flags & VEC_STRUCT)
2421 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2422 else
2423 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2424 }
2425
2426 return false;
2427 }
2428
2429 /* Return true if a function with type FNTYPE returns its value in
2430 SVE vector or predicate registers. */
2431
2432 static bool
2433 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2434 {
2435 tree return_type = TREE_TYPE (fntype);
2436
2437 pure_scalable_type_info pst_info;
2438 switch (pst_info.analyze (return_type))
2439 {
2440 case pure_scalable_type_info::IS_PST:
2441 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2442 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2443
2444 case pure_scalable_type_info::DOESNT_MATTER:
2445 gcc_assert (aarch64_return_in_memory_1 (return_type));
2446 return false;
2447
2448 case pure_scalable_type_info::NO_ABI_IDENTITY:
2449 case pure_scalable_type_info::ISNT_PST:
2450 return false;
2451 }
2452 gcc_unreachable ();
2453 }
2454
2455 /* Return true if a function with type FNTYPE takes arguments in
2456 SVE vector or predicate registers. */
2457
2458 static bool
2459 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2460 {
2461 CUMULATIVE_ARGS args_so_far_v;
2462 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2463 NULL_TREE, 0, true);
2464 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2465
2466 for (tree chain = TYPE_ARG_TYPES (fntype);
2467 chain && chain != void_list_node;
2468 chain = TREE_CHAIN (chain))
2469 {
2470 tree arg_type = TREE_VALUE (chain);
2471 if (arg_type == error_mark_node)
2472 return false;
2473
2474 function_arg_info arg (arg_type, /*named=*/true);
2475 apply_pass_by_reference_rules (&args_so_far_v, arg);
2476 pure_scalable_type_info pst_info;
2477 if (pst_info.analyze_registers (arg.type))
2478 {
2479 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2480 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2481 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2482 return true;
2483 }
2484
2485 targetm.calls.function_arg_advance (args_so_far, arg);
2486 }
2487 return false;
2488 }
2489
2490 /* Implement TARGET_FNTYPE_ABI. */
2491
2492 static const predefined_function_abi &
2493 aarch64_fntype_abi (const_tree fntype)
2494 {
2495 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2496 return aarch64_simd_abi ();
2497
2498 if (aarch64_returns_value_in_sve_regs_p (fntype)
2499 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2500 return aarch64_sve_abi ();
2501
2502 return default_function_abi;
2503 }
2504
2505 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2506
2507 static bool
2508 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2509 {
2510 return (aarch64_sve::builtin_type_p (type1)
2511 == aarch64_sve::builtin_type_p (type2));
2512 }
2513
2514 /* Return true if we should emit CFI for register REGNO. */
2515
2516 static bool
2517 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2518 {
2519 return (GP_REGNUM_P (regno)
2520 || !default_function_abi.clobbers_full_reg_p (regno));
2521 }
2522
2523 /* Return the mode we should use to save and restore register REGNO. */
2524
2525 static machine_mode
2526 aarch64_reg_save_mode (unsigned int regno)
2527 {
2528 if (GP_REGNUM_P (regno))
2529 return DImode;
2530
2531 if (FP_REGNUM_P (regno))
2532 switch (crtl->abi->id ())
2533 {
2534 case ARM_PCS_AAPCS64:
2535 /* Only the low 64 bits are saved by the base PCS. */
2536 return DFmode;
2537
2538 case ARM_PCS_SIMD:
2539 /* The vector PCS saves the low 128 bits (which is the full
2540 register on non-SVE targets). */
2541 return TFmode;
2542
2543 case ARM_PCS_SVE:
2544 /* Use vectors of DImode for registers that need frame
2545 information, so that the first 64 bits of the save slot
2546 are always the equivalent of what storing D<n> would give. */
2547 if (aarch64_emit_cfi_for_reg_p (regno))
2548 return VNx2DImode;
2549
2550 /* Use vectors of bytes otherwise, so that the layout is
2551 endian-agnostic, and so that we can use LDR and STR for
2552 big-endian targets. */
2553 return VNx16QImode;
2554
2555 case ARM_PCS_TLSDESC:
2556 case ARM_PCS_UNKNOWN:
2557 break;
2558 }
2559
2560 if (PR_REGNUM_P (regno))
2561 /* Save the full predicate register. */
2562 return VNx16BImode;
2563
2564 gcc_unreachable ();
2565 }
2566
2567 /* Implement TARGET_INSN_CALLEE_ABI. */
2568
2569 const predefined_function_abi &
2570 aarch64_insn_callee_abi (const rtx_insn *insn)
2571 {
2572 rtx pat = PATTERN (insn);
2573 gcc_assert (GET_CODE (pat) == PARALLEL);
2574 rtx unspec = XVECEXP (pat, 0, 1);
2575 gcc_assert (GET_CODE (unspec) == UNSPEC
2576 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2577 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2578 }
2579
2580 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2581 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2582 clobbers the top 64 bits when restoring the bottom 64 bits. */
2583
2584 static bool
2585 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2586 unsigned int regno,
2587 machine_mode mode)
2588 {
2589 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2590 {
2591 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2592 unsigned int nregs = hard_regno_nregs (regno, mode);
2593 if (nregs > 1)
2594 per_register_size = exact_div (per_register_size, nregs);
2595 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2596 return maybe_gt (per_register_size, 16);
2597 return maybe_gt (per_register_size, 8);
2598 }
2599 return false;
2600 }
2601
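/* For example, under the base PCS a 128-bit value such as V4SImode in
   v8 is treated as partially clobbered (only the low 64 bits survive a
   call), under the vector PCS it is not, and under the SVE PCS FP
   registers are never considered partially clobbered by this hook.  */
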
2602 /* Implement REGMODE_NATURAL_SIZE. */
2603 poly_uint64
2604 aarch64_regmode_natural_size (machine_mode mode)
2605 {
2606 /* The natural size for SVE data modes is one SVE data vector,
2607 and similarly for predicates. We can't independently modify
2608 anything smaller than that. */
2609 /* ??? For now, only do this for variable-width SVE registers.
2610 Doing it for constant-sized registers breaks lower-subreg.c. */
2611 /* ??? And once that's fixed, we should probably have similar
2612 code for Advanced SIMD. */
2613 if (!aarch64_sve_vg.is_constant ())
2614 {
2615 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2616 if (vec_flags & VEC_SVE_PRED)
2617 return BYTES_PER_SVE_PRED;
2618 if (vec_flags & VEC_SVE_DATA)
2619 return BYTES_PER_SVE_VECTOR;
2620 }
2621 return UNITS_PER_WORD;
2622 }
2623
2624 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2625 machine_mode
2626 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2627 machine_mode mode)
2628 {
2629 /* The predicate mode determines which bits are significant and
2630 which are "don't care". Decreasing the number of lanes would
2631 lose data while increasing the number of lanes would make bits
2632 unnecessarily significant. */
2633 if (PR_REGNUM_P (regno))
2634 return mode;
2635 if (known_ge (GET_MODE_SIZE (mode), 4))
2636 return mode;
2637 else
2638 return SImode;
2639 }
2640
2641 /* Return true if I's bits are consecutive ones from the MSB. */
2642 bool
2643 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2644 {
2645 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2646 }
2647
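/* For example, 0xffffffffffff0000 negates to 0x10000, a power of two,
   so the function above returns true, whereas 0xffff0000ffff0000 does
   not negate to a power of two and so returns false.  */
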
2648 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2649 that strcpy from constants will be faster. */
2650
2651 static HOST_WIDE_INT
2652 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2653 {
2654 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2655 return MAX (align, BITS_PER_WORD);
2656 return align;
2657 }
2658
2659 /* Return true if calls to DECL should be treated as
2660 long-calls (i.e. called via a register). */
2661 static bool
2662 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2663 {
2664 return false;
2665 }
2666
2667 /* Return true if calls to symbol-ref SYM should be treated as
2668 long-calls (i.e. called via a register). */
2669 bool
2670 aarch64_is_long_call_p (rtx sym)
2671 {
2672 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2673 }
2674
2675 /* Return true if calls to symbol-ref SYM should not go through
2676 plt stubs. */
2677
2678 bool
2679 aarch64_is_noplt_call_p (rtx sym)
2680 {
2681 const_tree decl = SYMBOL_REF_DECL (sym);
2682
2683 if (flag_pic
2684 && decl
2685 && (!flag_plt
2686 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2687 && !targetm.binds_local_p (decl))
2688 return true;
2689
2690 return false;
2691 }
2692
2693 /* Return true if the offsets to a zero/sign-extract operation
2694 represent an expression that matches an extend operation. The
2695 operands represent the parameters from
2696
2697 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2698 bool
2699 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2700 rtx extract_imm)
2701 {
2702 HOST_WIDE_INT mult_val, extract_val;
2703
2704 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2705 return false;
2706
2707 mult_val = INTVAL (mult_imm);
2708 extract_val = INTVAL (extract_imm);
2709
2710 if (extract_val > 8
2711 && extract_val < GET_MODE_BITSIZE (mode)
2712 && exact_log2 (extract_val & ~7) > 0
2713 && (extract_val & 7) <= 4
2714 && mult_val == (1 << (extract_val & 7)))
2715 return true;
2716
2717 return false;
2718 }
2719
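/* For example (a sketch of the intended use rather than a quote from
   the machine description): MULT_IMM == 4 and EXTRACT_IMM == 34 pass
   the checks above; the low 34 bits of (reg * 4) are the low 32 bits
   of the register shifted left by 2, i.e. an extend of a 32-bit value
   combined with a shift by 2, as in extended-register operands such as
   "uxtw #2".  */
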
2720 /* Emit an insn that's a simple single-set. Both the operands must be
2721 known to be valid. */
2722 inline static rtx_insn *
2723 emit_set_insn (rtx x, rtx y)
2724 {
2725 return emit_insn (gen_rtx_SET (x, y));
2726 }
2727
2728 /* X and Y are two things to compare using CODE. Emit the compare insn and
2729 return the rtx for register 0 in the proper mode. */
2730 rtx
2731 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2732 {
2733 machine_mode cmp_mode = GET_MODE (x);
2734 machine_mode cc_mode;
2735 rtx cc_reg;
2736
2737 if (cmp_mode == TImode)
2738 {
2739 gcc_assert (code == NE);
2740
2741 cc_mode = CCmode;
2742 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2743
2744 rtx x_lo = operand_subword (x, 0, 0, TImode);
2745 rtx y_lo = operand_subword (y, 0, 0, TImode);
2746 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2747
2748 rtx x_hi = operand_subword (x, 1, 0, TImode);
2749 rtx y_hi = operand_subword (y, 1, 0, TImode);
2750 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2751 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2752 GEN_INT (AARCH64_EQ)));
2753 }
2754 else
2755 {
2756 cc_mode = SELECT_CC_MODE (code, x, y);
2757 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2758 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2759 }
2760 return cc_reg;
2761 }
2762
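/* For illustration, the TImode NE case above typically assembles to a
   compare of the low halves followed by a conditional compare of the
   high halves, along the lines of (register allocation is an example
   only):

     cmp   x0, x2
     ccmp  x1, x3, 0, eq

   so that the Z flag is set only if both halves compare equal.  */
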
2763 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2764
2765 static rtx
2766 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2767 machine_mode y_mode)
2768 {
2769 if (y_mode == E_QImode || y_mode == E_HImode)
2770 {
2771 if (CONST_INT_P (y))
2772 {
2773 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2774 y_mode = SImode;
2775 }
2776 else
2777 {
2778 rtx t, cc_reg;
2779 machine_mode cc_mode;
2780
2781 t = gen_rtx_ZERO_EXTEND (SImode, y);
2782 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2783 cc_mode = CC_SWPmode;
2784 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2785 emit_set_insn (cc_reg, t);
2786 return cc_reg;
2787 }
2788 }
2789
2790 if (!aarch64_plus_operand (y, y_mode))
2791 y = force_reg (y_mode, y);
2792
2793 return aarch64_gen_compare_reg (code, x, y);
2794 }
2795
2796 /* Build the SYMBOL_REF for __tls_get_addr. */
2797
2798 static GTY(()) rtx tls_get_addr_libfunc;
2799
2800 rtx
2801 aarch64_tls_get_addr (void)
2802 {
2803 if (!tls_get_addr_libfunc)
2804 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2805 return tls_get_addr_libfunc;
2806 }
2807
2808 /* Return the TLS model to use for ADDR. */
2809
2810 static enum tls_model
2811 tls_symbolic_operand_type (rtx addr)
2812 {
2813 enum tls_model tls_kind = TLS_MODEL_NONE;
2814 if (GET_CODE (addr) == CONST)
2815 {
2816 poly_int64 addend;
2817 rtx sym = strip_offset (addr, &addend);
2818 if (GET_CODE (sym) == SYMBOL_REF)
2819 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2820 }
2821 else if (GET_CODE (addr) == SYMBOL_REF)
2822 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2823
2824 return tls_kind;
2825 }
2826
2827 /* We allow lo_sum's in our legitimate addresses so that combine
2828 can take care of combining addresses where necessary, but for
2829 generation purposes, we generate the address
2830 as follows:
2831 RTL Absolute
2832 tmp = hi (symbol_ref); adrp x1, foo
2833 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2834 nop
2835
2836 PIC TLS
2837 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2838 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2839 bl __tls_get_addr
2840 nop
2841
2842 Load TLS symbol, depending on TLS mechanism and TLS access model.
2843
2844 Global Dynamic - Traditional TLS:
2845 adrp tmp, :tlsgd:imm
2846 add dest, tmp, #:tlsgd_lo12:imm
2847 bl __tls_get_addr
2848
2849 Global Dynamic - TLS Descriptors:
2850 adrp dest, :tlsdesc:imm
2851 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2852 add dest, dest, #:tlsdesc_lo12:imm
2853 blr tmp
2854 mrs tp, tpidr_el0
2855 add dest, dest, tp
2856
2857 Initial Exec:
2858 mrs tp, tpidr_el0
2859 adrp tmp, :gottprel:imm
2860 ldr dest, [tmp, #:gottprel_lo12:imm]
2861 add dest, dest, tp
2862
2863 Local Exec:
2864 mrs tp, tpidr_el0
2865 add t0, tp, #:tprel_hi12:imm, lsl #12
2866 add t0, t0, #:tprel_lo12_nc:imm
2867 */
2868
2869 static void
2870 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2871 enum aarch64_symbol_type type)
2872 {
2873 switch (type)
2874 {
2875 case SYMBOL_SMALL_ABSOLUTE:
2876 {
2877 /* In ILP32, the mode of dest can be either SImode or DImode. */
2878 rtx tmp_reg = dest;
2879 machine_mode mode = GET_MODE (dest);
2880
2881 gcc_assert (mode == Pmode || mode == ptr_mode);
2882
2883 if (can_create_pseudo_p ())
2884 tmp_reg = gen_reg_rtx (mode);
2885
2886 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2887 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2888 return;
2889 }
2890
2891 case SYMBOL_TINY_ABSOLUTE:
2892 emit_insn (gen_rtx_SET (dest, imm));
2893 return;
2894
2895 case SYMBOL_SMALL_GOT_28K:
2896 {
2897 machine_mode mode = GET_MODE (dest);
2898 rtx gp_rtx = pic_offset_table_rtx;
2899 rtx insn;
2900 rtx mem;
2901
2902 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2903 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
2904 decide rtx costs, in which case pic_offset_table_rtx is not
2905 initialized. In that case there is no need to generate the first adrp
2906 instruction, since the final cost for global variable access is
2907 one instruction. */
2908 if (gp_rtx != NULL)
2909 {
2910 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2911 use the page base as the GOT base, the first page may be wasted;
2912 in the worst case there is only 28K of space for the GOT).
2913
2914 The generated instruction sequence for accessing a global variable
2915 is:
2916
2917 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2918
2919 Only one instruction is needed. But we must initialize
2920 pic_offset_table_rtx properly. We generate an initialization insn
2921 for every global access, and rely on CSE to remove the redundant ones.
2922
2923 The final instruction sequence will look like the following
2924 for multiple global variable accesses:
2925
2926 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2927
2928 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2929 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2930 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2931 ... */
2932
2933 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2934 crtl->uses_pic_offset_table = 1;
2935 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2936
2937 if (mode != GET_MODE (gp_rtx))
2938 gp_rtx = gen_lowpart (mode, gp_rtx);
2939
2940 }
2941
2942 if (mode == ptr_mode)
2943 {
2944 if (mode == DImode)
2945 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2946 else
2947 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2948
2949 mem = XVECEXP (SET_SRC (insn), 0, 0);
2950 }
2951 else
2952 {
2953 gcc_assert (mode == Pmode);
2954
2955 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2956 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2957 }
2958
2959 /* The operand is expected to be a MEM. Whenever the related insn
2960 pattern changes, the code above that calculates MEM should be
2961 updated. */
2962 gcc_assert (GET_CODE (mem) == MEM);
2963 MEM_READONLY_P (mem) = 1;
2964 MEM_NOTRAP_P (mem) = 1;
2965 emit_insn (insn);
2966 return;
2967 }
2968
2969 case SYMBOL_SMALL_GOT_4G:
2970 {
2971 /* In ILP32, the mode of dest can be either SImode or DImode,
2972 while the got entry is always of SImode size. The mode of
2973 dest depends on how dest is used: if dest is assigned to a
2974 pointer (e.g. stored in memory), it has SImode; it may have
2975 DImode if dest is dereferenced to access the memory.
2976 This is why we have to handle three different ldr_got_small
2977 patterns here (two patterns for ILP32). */
2978
2979 rtx insn;
2980 rtx mem;
2981 rtx tmp_reg = dest;
2982 machine_mode mode = GET_MODE (dest);
2983
2984 if (can_create_pseudo_p ())
2985 tmp_reg = gen_reg_rtx (mode);
2986
2987 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2988 if (mode == ptr_mode)
2989 {
2990 if (mode == DImode)
2991 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2992 else
2993 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2994
2995 mem = XVECEXP (SET_SRC (insn), 0, 0);
2996 }
2997 else
2998 {
2999 gcc_assert (mode == Pmode);
3000
3001 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3002 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3003 }
3004
3005 gcc_assert (GET_CODE (mem) == MEM);
3006 MEM_READONLY_P (mem) = 1;
3007 MEM_NOTRAP_P (mem) = 1;
3008 emit_insn (insn);
3009 return;
3010 }
3011
3012 case SYMBOL_SMALL_TLSGD:
3013 {
3014 rtx_insn *insns;
3015 /* The return type of __tls_get_addr is the C pointer type
3016 so use ptr_mode. */
3017 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3018 rtx tmp_reg = dest;
3019
3020 if (GET_MODE (dest) != ptr_mode)
3021 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3022
3023 start_sequence ();
3024 if (ptr_mode == SImode)
3025 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3026 else
3027 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3028 insns = get_insns ();
3029 end_sequence ();
3030
3031 RTL_CONST_CALL_P (insns) = 1;
3032 emit_libcall_block (insns, tmp_reg, result, imm);
3033 /* Convert back to the mode of dest, adding a zero_extend
3034 from SImode (ptr_mode) to DImode (Pmode). */
3035 if (dest != tmp_reg)
3036 convert_move (dest, tmp_reg, true);
3037 return;
3038 }
3039
3040 case SYMBOL_SMALL_TLSDESC:
3041 {
3042 machine_mode mode = GET_MODE (dest);
3043 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3044 rtx tp;
3045
3046 gcc_assert (mode == Pmode || mode == ptr_mode);
3047
3048 /* In ILP32, the got entry is always of SImode size. Unlike
3049 small GOT, the dest is fixed at reg 0. */
3050 if (TARGET_ILP32)
3051 emit_insn (gen_tlsdesc_small_si (imm));
3052 else
3053 emit_insn (gen_tlsdesc_small_di (imm));
3054 tp = aarch64_load_tp (NULL);
3055
3056 if (mode != Pmode)
3057 tp = gen_lowpart (mode, tp);
3058
3059 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3060 if (REG_P (dest))
3061 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3062 return;
3063 }
3064
3065 case SYMBOL_SMALL_TLSIE:
3066 {
3067 /* In ILP32, the mode of dest can be either SImode or DImode,
3068 while the got entry is always of SImode size. The mode of
3069 dest depends on how dest is used: if dest is assigned to a
3070 pointer (e.g. stored in memory), it has SImode; it may have
3071 DImode if dest is dereferenced to access the memory.
3072 This is why we have to handle three different tlsie_small
3073 patterns here (two patterns for ILP32). */
3074 machine_mode mode = GET_MODE (dest);
3075 rtx tmp_reg = gen_reg_rtx (mode);
3076 rtx tp = aarch64_load_tp (NULL);
3077
3078 if (mode == ptr_mode)
3079 {
3080 if (mode == DImode)
3081 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3082 else
3083 {
3084 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3085 tp = gen_lowpart (mode, tp);
3086 }
3087 }
3088 else
3089 {
3090 gcc_assert (mode == Pmode);
3091 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3092 }
3093
3094 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3095 if (REG_P (dest))
3096 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3097 return;
3098 }
3099
3100 case SYMBOL_TLSLE12:
3101 case SYMBOL_TLSLE24:
3102 case SYMBOL_TLSLE32:
3103 case SYMBOL_TLSLE48:
3104 {
3105 machine_mode mode = GET_MODE (dest);
3106 rtx tp = aarch64_load_tp (NULL);
3107
3108 if (mode != Pmode)
3109 tp = gen_lowpart (mode, tp);
3110
3111 switch (type)
3112 {
3113 case SYMBOL_TLSLE12:
3114 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3115 (dest, tp, imm));
3116 break;
3117 case SYMBOL_TLSLE24:
3118 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3119 (dest, tp, imm));
3120 break;
3121 case SYMBOL_TLSLE32:
3122 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3123 (dest, imm));
3124 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3125 (dest, dest, tp));
3126 break;
3127 case SYMBOL_TLSLE48:
3128 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3129 (dest, imm));
3130 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3131 (dest, dest, tp));
3132 break;
3133 default:
3134 gcc_unreachable ();
3135 }
3136
3137 if (REG_P (dest))
3138 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3139 return;
3140 }
3141
3142 case SYMBOL_TINY_GOT:
3143 {
3144 rtx insn;
3145 machine_mode mode = GET_MODE (dest);
3146
3147 if (mode == ptr_mode)
3148 insn = gen_ldr_got_tiny (mode, dest, imm);
3149 else
3150 {
3151 gcc_assert (mode == Pmode);
3152 insn = gen_ldr_got_tiny_sidi (dest, imm);
3153 }
3154
3155 emit_insn (insn);
3156 return;
3157 }
3158
3159 case SYMBOL_TINY_TLSIE:
3160 {
3161 machine_mode mode = GET_MODE (dest);
3162 rtx tp = aarch64_load_tp (NULL);
3163
3164 if (mode == ptr_mode)
3165 {
3166 if (mode == DImode)
3167 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3168 else
3169 {
3170 tp = gen_lowpart (mode, tp);
3171 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3172 }
3173 }
3174 else
3175 {
3176 gcc_assert (mode == Pmode);
3177 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3178 }
3179
3180 if (REG_P (dest))
3181 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3182 return;
3183 }
3184
3185 default:
3186 gcc_unreachable ();
3187 }
3188 }
3189
3190 /* Emit a move from SRC to DEST. Assume that the move expanders can
3191 handle all moves if !can_create_pseudo_p (). The distinction is
3192 important because, unlike emit_move_insn, the move expanders know
3193 how to force Pmode objects into the constant pool even when the
3194 constant pool address is not itself legitimate. */
3195 static rtx
3196 aarch64_emit_move (rtx dest, rtx src)
3197 {
3198 return (can_create_pseudo_p ()
3199 ? emit_move_insn (dest, src)
3200 : emit_move_insn_1 (dest, src));
3201 }
3202
3203 /* Apply UNOPTAB to OP and store the result in DEST. */
3204
3205 static void
3206 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3207 {
3208 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3209 if (dest != tmp)
3210 emit_move_insn (dest, tmp);
3211 }
3212
3213 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3214
3215 static void
3216 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3217 {
3218 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3219 OPTAB_DIRECT);
3220 if (dest != tmp)
3221 emit_move_insn (dest, tmp);
3222 }
3223
3224 /* Split a 128-bit move operation into two 64-bit move operations,
3225 taking care to handle partial overlap of register to register
3226 copies. Special cases are needed when moving between GP regs and
3227 FP regs. SRC can be a register, constant or memory; DST a register
3228 or memory. If either operand is memory it must not have any side
3229 effects. */
3230 void
3231 aarch64_split_128bit_move (rtx dst, rtx src)
3232 {
3233 rtx dst_lo, dst_hi;
3234 rtx src_lo, src_hi;
3235
3236 machine_mode mode = GET_MODE (dst);
3237
3238 gcc_assert (mode == TImode || mode == TFmode);
3239 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3240 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3241
3242 if (REG_P (dst) && REG_P (src))
3243 {
3244 int src_regno = REGNO (src);
3245 int dst_regno = REGNO (dst);
3246
3247 /* Handle FP <-> GP regs. */
3248 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3249 {
3250 src_lo = gen_lowpart (word_mode, src);
3251 src_hi = gen_highpart (word_mode, src);
3252
3253 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3254 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3255 return;
3256 }
3257 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3258 {
3259 dst_lo = gen_lowpart (word_mode, dst);
3260 dst_hi = gen_highpart (word_mode, dst);
3261
3262 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3263 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3264 return;
3265 }
3266 }
3267
3268 dst_lo = gen_lowpart (word_mode, dst);
3269 dst_hi = gen_highpart (word_mode, dst);
3270 src_lo = gen_lowpart (word_mode, src);
3271 src_hi = gen_highpart_mode (word_mode, mode, src);
3272
3273 /* At most one pairing may overlap. */
3274 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3275 {
3276 aarch64_emit_move (dst_hi, src_hi);
3277 aarch64_emit_move (dst_lo, src_lo);
3278 }
3279 else
3280 {
3281 aarch64_emit_move (dst_lo, src_lo);
3282 aarch64_emit_move (dst_hi, src_hi);
3283 }
3284 }
3285
3286 bool
3287 aarch64_split_128bit_move_p (rtx dst, rtx src)
3288 {
3289 return (! REG_P (src)
3290 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3291 }
3292
3293 /* Split a complex SIMD combine. */
3294
3295 void
3296 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3297 {
3298 machine_mode src_mode = GET_MODE (src1);
3299 machine_mode dst_mode = GET_MODE (dst);
3300
3301 gcc_assert (VECTOR_MODE_P (dst_mode));
3302 gcc_assert (register_operand (dst, dst_mode)
3303 && register_operand (src1, src_mode)
3304 && register_operand (src2, src_mode));
3305
3306 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3307 return;
3308 }
3309
3310 /* Split a complex SIMD move. */
3311
3312 void
3313 aarch64_split_simd_move (rtx dst, rtx src)
3314 {
3315 machine_mode src_mode = GET_MODE (src);
3316 machine_mode dst_mode = GET_MODE (dst);
3317
3318 gcc_assert (VECTOR_MODE_P (dst_mode));
3319
3320 if (REG_P (dst) && REG_P (src))
3321 {
3322 gcc_assert (VECTOR_MODE_P (src_mode));
3323 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3324 }
3325 }
3326
3327 bool
3328 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3329 machine_mode ymode, rtx y)
3330 {
3331 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3332 gcc_assert (r != NULL);
3333 return rtx_equal_p (x, r);
3334 }
3335
3336 /* Return TARGET if it is nonnull and a register of mode MODE.
3337 Otherwise, return a fresh register of mode MODE if we can,
3338 or TARGET reinterpreted as MODE if we can't. */
3339
3340 static rtx
3341 aarch64_target_reg (rtx target, machine_mode mode)
3342 {
3343 if (target && REG_P (target) && GET_MODE (target) == mode)
3344 return target;
3345 if (!can_create_pseudo_p ())
3346 {
3347 gcc_assert (target);
3348 return gen_lowpart (mode, target);
3349 }
3350 return gen_reg_rtx (mode);
3351 }
3352
3353 /* Return a register that contains the constant in BUILDER, given that
3354 the constant is a legitimate move operand. Use TARGET as the register
3355 if it is nonnull and convenient. */
3356
3357 static rtx
3358 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3359 {
3360 rtx src = builder.build ();
3361 target = aarch64_target_reg (target, GET_MODE (src));
3362 emit_insn (gen_rtx_SET (target, src));
3363 return target;
3364 }
3365
3366 static rtx
3367 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3368 {
3369 if (can_create_pseudo_p ())
3370 return force_reg (mode, value);
3371 else
3372 {
3373 gcc_assert (x);
3374 aarch64_emit_move (x, value);
3375 return x;
3376 }
3377 }
3378
3379 /* Return true if predicate value X is a constant in which every element
3380 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3381 value, i.e. as a predicate in which all bits are significant. */
3382
3383 static bool
3384 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3385 {
3386 if (GET_CODE (x) != CONST_VECTOR)
3387 return false;
3388
3389 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3390 GET_MODE_NUNITS (GET_MODE (x)));
3391 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3392 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3393 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3394
3395 unsigned int nelts = const_vector_encoded_nelts (x);
3396 for (unsigned int i = 0; i < nelts; ++i)
3397 {
3398 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3399 if (!CONST_INT_P (elt))
3400 return false;
3401
3402 builder.quick_push (elt);
3403 for (unsigned int j = 1; j < factor; ++j)
3404 builder.quick_push (const0_rtx);
3405 }
3406 builder.finalize ();
3407 return true;
3408 }
3409
3410 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3411 widest predicate element size it can have (that is, the largest size
3412 for which each element would still be 0 or 1). */
3413
3414 unsigned int
3415 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3416 {
3417 /* Start with the most optimistic assumption: that we only need
3418 one bit per pattern. This is what we will use if only the first
3419 bit in each pattern is ever set. */
3420 unsigned int mask = GET_MODE_SIZE (DImode);
3421 mask |= builder.npatterns ();
3422
3423 /* Look for set bits. */
3424 unsigned int nelts = builder.encoded_nelts ();
3425 for (unsigned int i = 1; i < nelts; ++i)
3426 if (INTVAL (builder.elt (i)) != 0)
3427 {
3428 if (i & 1)
3429 return 1;
3430 mask |= i;
3431 }
3432 return mask & -mask;
3433 }
3434
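/* For example, a VNx16BImode constant with four patterns and one
   element per pattern whose encoded elements are { 1, 0, 0, 0 }
   describes a predicate with one significant bit per four bytes, and
   the function above returns 4 for it.  */
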
3435 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3436 return that predicate mode, otherwise return opt_machine_mode (). */
3437
3438 opt_machine_mode
3439 aarch64_ptrue_all_mode (rtx x)
3440 {
3441 gcc_assert (GET_MODE (x) == VNx16BImode);
3442 if (GET_CODE (x) != CONST_VECTOR
3443 || !CONST_VECTOR_DUPLICATE_P (x)
3444 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3445 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3446 return opt_machine_mode ();
3447
3448 unsigned int nelts = const_vector_encoded_nelts (x);
3449 for (unsigned int i = 1; i < nelts; ++i)
3450 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3451 return opt_machine_mode ();
3452
3453 return aarch64_sve_pred_mode (nelts);
3454 }
3455
3456 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3457 that the constant would have with predicate element size ELT_SIZE
3458 (ignoring the upper bits in each element) and return:
3459
3460 * -1 if all bits are set
3461 * N if the predicate has N leading set bits followed by all clear bits
3462 * 0 if the predicate does not have any of these forms. */
3463
3464 int
3465 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3466 unsigned int elt_size)
3467 {
3468 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3469 followed by set bits. */
3470 if (builder.nelts_per_pattern () == 3)
3471 return 0;
3472
3473 /* Skip over leading set bits. */
3474 unsigned int nelts = builder.encoded_nelts ();
3475 unsigned int i = 0;
3476 for (; i < nelts; i += elt_size)
3477 if (INTVAL (builder.elt (i)) == 0)
3478 break;
3479 unsigned int vl = i / elt_size;
3480
3481 /* Check for the all-true case. */
3482 if (i == nelts)
3483 return -1;
3484
3485 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3486 repeating pattern of set bits followed by clear bits. */
3487 if (builder.nelts_per_pattern () != 2)
3488 return 0;
3489
3490 /* We have a "foreground" value and a duplicated "background" value.
3491 If the background might repeat and the last set bit belongs to it,
3492 we might have set bits followed by clear bits followed by set bits. */
3493 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3494 return 0;
3495
3496 /* Make sure that the rest are all clear. */
3497 for (; i < nelts; i += elt_size)
3498 if (INTVAL (builder.elt (i)) != 0)
3499 return 0;
3500
3501 return vl;
3502 }
3503
3504 /* See if there is an svpattern that encodes an SVE predicate of mode
3505 PRED_MODE in which the first VL bits are set and the rest are clear.
3506 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3507 A VL of -1 indicates an all-true vector. */
3508
3509 aarch64_svpattern
3510 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3511 {
3512 if (vl < 0)
3513 return AARCH64_SV_ALL;
3514
3515 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3516 return AARCH64_NUM_SVPATTERNS;
3517
3518 if (vl >= 1 && vl <= 8)
3519 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3520
3521 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3522 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3523
3524 int max_vl;
3525 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3526 {
3527 if (vl == (max_vl / 3) * 3)
3528 return AARCH64_SV_MUL3;
3529 /* These would only trigger for non-power-of-2 lengths. */
3530 if (vl == (max_vl & -4))
3531 return AARCH64_SV_MUL4;
3532 if (vl == (1 << floor_log2 (max_vl)))
3533 return AARCH64_SV_POW2;
3534 if (vl == max_vl)
3535 return AARCH64_SV_ALL;
3536 }
3537 return AARCH64_NUM_SVPATTERNS;
3538 }
3539
3540 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3541 bits has the lowest bit set and the upper bits clear. This is the
3542 VNx16BImode equivalent of a PTRUE for controlling elements of
3543 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3544 all bits are significant, even the upper zeros. */
3545
3546 rtx
3547 aarch64_ptrue_all (unsigned int elt_size)
3548 {
3549 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3550 builder.quick_push (const1_rtx);
3551 for (unsigned int i = 1; i < elt_size; ++i)
3552 builder.quick_push (const0_rtx);
3553 return builder.build ();
3554 }
3555
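/* For example, aarch64_ptrue_all (4) builds the VNx16BImode constant
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the VNx16BImode view of the
   predicate that a PTRUE on .S elements would produce.  */
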
3556 /* Return an all-true predicate register of mode MODE. */
3557
3558 rtx
3559 aarch64_ptrue_reg (machine_mode mode)
3560 {
3561 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3562 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3563 return gen_lowpart (mode, reg);
3564 }
3565
3566 /* Return an all-false predicate register of mode MODE. */
3567
3568 rtx
3569 aarch64_pfalse_reg (machine_mode mode)
3570 {
3571 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3572 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3573 return gen_lowpart (mode, reg);
3574 }
3575
3576 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3577 true, or alternatively if we know that the operation predicated by
3578 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3579 aarch64_sve_gp_strictness operand that describes the operation
3580 predicated by PRED1[0]. */
3581
3582 bool
3583 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3584 {
3585 machine_mode mode = GET_MODE (pred2);
3586 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3587 && mode == GET_MODE (pred1[0])
3588 && aarch64_sve_gp_strictness (pred1[1], SImode));
3589 return (pred1[0] == CONSTM1_RTX (mode)
3590 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3591 || rtx_equal_p (pred1[0], pred2));
3592 }
3593
3594 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3595 for it. PRED2[0] is the predicate for the instruction whose result
3596 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3597 for it. Return true if we can prove that the two predicates are
3598 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3599 with PRED1[0] without changing behavior. */
3600
3601 bool
3602 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3603 {
3604 machine_mode mode = GET_MODE (pred1[0]);
3605 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3606 && mode == GET_MODE (pred2[0])
3607 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3608 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3609
3610 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3611 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3612 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3613 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3614 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3615 }
3616
3617 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3618 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3619 Use TARGET as the target register if nonnull and convenient. */
3620
3621 static rtx
3622 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3623 machine_mode data_mode, rtx op1, rtx op2)
3624 {
3625 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3626 expand_operand ops[5];
3627 create_output_operand (&ops[0], target, pred_mode);
3628 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3629 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3630 create_input_operand (&ops[3], op1, data_mode);
3631 create_input_operand (&ops[4], op2, data_mode);
3632 expand_insn (icode, 5, ops);
3633 return ops[0].value;
3634 }
3635
3636 /* Use a comparison to convert integer vector SRC into MODE, which is
3637 the corresponding SVE predicate mode. Use TARGET for the result
3638 if it's nonnull and convenient. */
3639
3640 rtx
3641 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3642 {
3643 machine_mode src_mode = GET_MODE (src);
3644 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3645 src, CONST0_RTX (src_mode));
3646 }
3647
3648 /* Return the assembly token for svprfop value PRFOP. */
3649
3650 static const char *
3651 svprfop_token (enum aarch64_svprfop prfop)
3652 {
3653 switch (prfop)
3654 {
3655 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3656 AARCH64_FOR_SVPRFOP (CASE)
3657 #undef CASE
3658 case AARCH64_NUM_SVPRFOPS:
3659 break;
3660 }
3661 gcc_unreachable ();
3662 }
3663
3664 /* Return the assembly string for an SVE prefetch operation with
3665 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3666 and that SUFFIX is the format for the remaining operands. */
3667
3668 char *
3669 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3670 const char *suffix)
3671 {
3672 static char buffer[128];
3673 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3674 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3675 mnemonic, svprfop_token (prfop), suffix);
3676 gcc_assert (written < sizeof (buffer));
3677 return buffer;
3678 }
3679
3680 /* Check whether we can calculate the number of elements in PATTERN
3681 at compile time, given that there are NELTS_PER_VQ elements per
3682 128-bit block. Return the value if so, otherwise return -1. */
3683
3684 HOST_WIDE_INT
3685 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3686 {
3687 unsigned int vl, const_vg;
3688 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3689 vl = 1 + (pattern - AARCH64_SV_VL1);
3690 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3691 vl = 16 << (pattern - AARCH64_SV_VL16);
3692 else if (aarch64_sve_vg.is_constant (&const_vg))
3693 {
3694 /* There are two vector granules per quadword. */
3695 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3696 switch (pattern)
3697 {
3698 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3699 case AARCH64_SV_MUL4: return nelts & -4;
3700 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3701 case AARCH64_SV_ALL: return nelts;
3702 default: gcc_unreachable ();
3703 }
3704 }
3705 else
3706 return -1;
3707
3708 /* There are two vector granules per quadword. */
3709 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3710 if (known_le (vl, nelts_all))
3711 return vl;
3712
3713 /* Requesting more elements than are available results in a PFALSE. */
3714 if (known_gt (vl, nelts_all))
3715 return 0;
3716
3717 return -1;
3718 }
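
/* Worked examples for the function above (illustrative only): with
   NELTS_PER_VQ == 16, AARCH64_SV_VL7 folds to 7 for any vector length,
   since at least 16 byte elements are always available.  With the vector
   length fixed at 256 bits (aarch64_sve_vg == 4) and NELTS_PER_VQ == 4,
   AARCH64_SV_ALL folds to 8, AARCH64_SV_MUL3 folds to 6, and
   AARCH64_SV_VL16 folds to 0, since asking for 16 .S elements when only
   8 are available gives a PFALSE.  */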
3719
3720 /* Return true if we can move VALUE into a register using a single
3721 CNT[BHWD] instruction. */
3722
3723 static bool
3724 aarch64_sve_cnt_immediate_p (poly_int64 value)
3725 {
3726 HOST_WIDE_INT factor = value.coeffs[0];
3727 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3728 return (value.coeffs[1] == factor
3729 && IN_RANGE (factor, 2, 16 * 16)
3730 && (factor & 1) == 0
3731 && factor <= 16 * (factor & -factor));
3732 }
3733
3734 /* Likewise for rtx X. */
3735
3736 bool
3737 aarch64_sve_cnt_immediate_p (rtx x)
3738 {
3739 poly_int64 value;
3740 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3741 }
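
/* For example, the poly_int64 values (2, 2) and (64, 64), the results of
   CNTD and of CNTB with MUL #4, are valid CNT immediates.  (3, 3) is
   rejected because it is odd, (7, 3) because its coefficients differ,
   and (512, 512) because it exceeds the upper bound of 16 * 16.  */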
3742
3743 /* Return the asm string for an instruction with a CNT-like vector size
3744 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3745 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3746 first part of the operands template (the part that comes before the
3747 vector size itself). PATTERN is the pattern to use. FACTOR is the
3748 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3749 in each quadword. If it is zero, we can use any element size. */
3750
3751 static char *
3752 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3753 aarch64_svpattern pattern,
3754 unsigned int factor,
3755 unsigned int nelts_per_vq)
3756 {
3757 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3758
3759 if (nelts_per_vq == 0)
3760 /* There is some overlap in the ranges of the four CNT instructions.
3761 Here we always use the smallest possible element size, so that the
3762 multiplier is 1 wherever possible. */
3763 nelts_per_vq = factor & -factor;
3764 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3765 gcc_assert (IN_RANGE (shift, 1, 4));
3766 char suffix = "dwhb"[shift - 1];
3767
3768 factor >>= shift;
3769 unsigned int written;
3770 if (pattern == AARCH64_SV_ALL && factor == 1)
3771 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3772 prefix, suffix, operands);
3773 else if (factor == 1)
3774 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3775 prefix, suffix, operands, svpattern_token (pattern));
3776 else
3777 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3778 prefix, suffix, operands, svpattern_token (pattern),
3779 factor);
3780 gcc_assert (written < sizeof (buffer));
3781 return buffer;
3782 }
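
/* For example, with PREFIX "cnt", OPERANDS "%x0" and PATTERN
   AARCH64_SV_ALL: a FACTOR of 2 with NELTS_PER_VQ of 0 selects .D
   elements and prints as "cntd\t%x0", while a FACTOR of 48 with
   NELTS_PER_VQ of 0 selects .B elements and prints as
   "cntb\t%x0, all, mul #3".  */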
3783
3784 /* Return the asm string for an instruction with a CNT-like vector size
3785 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3786 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3787 first part of the operands template (the part that comes before the
3788 vector size itself). X is the value of the vector size operand,
3789 as a polynomial integer rtx; we need to convert this into an "all"
3790 pattern with a multiplier. */
3791
3792 char *
3793 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3794 rtx x)
3795 {
3796 poly_int64 value = rtx_to_poly_int64 (x);
3797 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3798 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3799 value.coeffs[1], 0);
3800 }
3801
3802 /* Return the asm string for an instruction with a CNT-like vector size
3803 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3804 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3805 first part of the operands template (the part that comes before the
3806 vector size itself). CNT_PAT[0..2] are the operands of the
3807 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3808
3809 char *
3810 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3811 const char *operands, rtx *cnt_pat)
3812 {
3813 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3814 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3815 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3816 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3817 factor, nelts_per_vq);
3818 }
3819
3820 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3821
3822 bool
3823 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3824 {
3825 poly_int64 value;
3826 return (poly_int_rtx_p (x, &value)
3827 && (aarch64_sve_cnt_immediate_p (value)
3828 || aarch64_sve_cnt_immediate_p (-value)));
3829 }
3830
3831 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3832 operand 0. */
3833
3834 char *
3835 aarch64_output_sve_scalar_inc_dec (rtx offset)
3836 {
3837 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3838 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3839 if (offset_value.coeffs[1] > 0)
3840 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3841 offset_value.coeffs[1], 0);
3842 else
3843 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3844 -offset_value.coeffs[1], 0);
3845 }
3846
3847 /* Return true if we can add VALUE to a register using a single ADDVL
3848 or ADDPL instruction. */
3849
3850 static bool
3851 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3852 {
3853 HOST_WIDE_INT factor = value.coeffs[0];
3854 if (factor == 0 || value.coeffs[1] != factor)
3855 return false;
3856 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3857 and a value of 16 is one vector width. */
3858 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3859 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3860 }
3861
3862 /* Likewise for rtx X. */
3863
3864 bool
3865 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3866 {
3867 poly_int64 value;
3868 return (poly_int_rtx_p (x, &value)
3869 && aarch64_sve_addvl_addpl_immediate_p (value));
3870 }
3871
3872 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3873 to operand 1 and storing the result in operand 0. */
3874
3875 char *
3876 aarch64_output_sve_addvl_addpl (rtx offset)
3877 {
3878 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3879 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3880 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3881
3882 int factor = offset_value.coeffs[1];
3883 if ((factor & 15) == 0)
3884 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3885 else
3886 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3887 return buffer;
3888 }
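
/* For example, an OFFSET of (16, 16), i.e. one full vector length in
   bytes, prints as "addvl\t%x0, %x1, #1", while an OFFSET of (-2, -2),
   i.e. minus one predicate length in bytes, prints as
   "addpl\t%x0, %x1, #-1".  */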
3889
3890 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3891 instruction. If it is, store the number of elements in each vector
3892 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3893 factor in *FACTOR_OUT (if nonnull). */
3894
3895 bool
3896 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3897 unsigned int *nelts_per_vq_out)
3898 {
3899 rtx elt;
3900 poly_int64 value;
3901
3902 if (!const_vec_duplicate_p (x, &elt)
3903 || !poly_int_rtx_p (elt, &value))
3904 return false;
3905
3906 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3907 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3908 /* There's no vector INCB. */
3909 return false;
3910
3911 HOST_WIDE_INT factor = value.coeffs[0];
3912 if (value.coeffs[1] != factor)
3913 return false;
3914
3915 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3916 if ((factor % nelts_per_vq) != 0
3917 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3918 return false;
3919
3920 if (factor_out)
3921 *factor_out = factor;
3922 if (nelts_per_vq_out)
3923 *nelts_per_vq_out = nelts_per_vq;
3924 return true;
3925 }
3926
3927 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3928 instruction. */
3929
3930 bool
3931 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3932 {
3933 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3934 }
3935
3936 /* Return the asm template for an SVE vector INC or DEC instruction.
3937 OPERANDS gives the operands before the vector count and X is the
3938 value of the vector count operand itself. */
3939
3940 char *
3941 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3942 {
3943 int factor;
3944 unsigned int nelts_per_vq;
3945 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3946 gcc_unreachable ();
3947 if (factor < 0)
3948 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3949 -factor, nelts_per_vq);
3950 else
3951 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3952 factor, nelts_per_vq);
3953 }
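
/* For example, adding a VNx8HImode constant in which every element is
   (16, 16), i.e. twice the number of .H elements in the vector, is
   accepted by aarch64_sve_vector_inc_dec_immediate_p with FACTOR == 16
   and NELTS_PER_VQ == 8, and prints as "inch\t<operands>, all, mul #2".  */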
3954
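/* Return the number of instructions (at most four) required to move
   immediate IMM into DEST, which has scalar integer mode MODE.
   Emit the instructions if GENERATE is true, otherwise just count
   them.  */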
3955 static int
3956 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3957 scalar_int_mode mode)
3958 {
3959 int i;
3960 unsigned HOST_WIDE_INT val, val2, mask;
3961 int one_match, zero_match;
3962 int num_insns;
3963
3964 val = INTVAL (imm);
3965
3966 if (aarch64_move_imm (val, mode))
3967 {
3968 if (generate)
3969 emit_insn (gen_rtx_SET (dest, imm));
3970 return 1;
3971 }
3972
3973 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3974 (with XXXX non-zero). In that case check to see if the move can be done in
3975 a smaller mode. */
3976 val2 = val & 0xffffffff;
3977 if (mode == DImode
3978 && aarch64_move_imm (val2, SImode)
3979 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3980 {
3981 if (generate)
3982 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3983
3984 /* Check if we have to emit a second instruction by checking to see
3985 if any of the upper 32 bits of the original DI mode value is set. */
3986 if (val == val2)
3987 return 1;
3988
3989 i = (val >> 48) ? 48 : 32;
3990
3991 if (generate)
3992 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3993 GEN_INT ((val >> i) & 0xffff)));
3994
3995 return 2;
3996 }
3997
3998 if ((val >> 32) == 0 || mode == SImode)
3999 {
4000 if (generate)
4001 {
4002 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4003 if (mode == SImode)
4004 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4005 GEN_INT ((val >> 16) & 0xffff)));
4006 else
4007 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4008 GEN_INT ((val >> 16) & 0xffff)));
4009 }
4010 return 2;
4011 }
4012
4013 /* Remaining cases are all for DImode. */
4014
4015 mask = 0xffff;
4016 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4017 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4018 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4019 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4020
4021 if (zero_match != 2 && one_match != 2)
4022 {
4023 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4024 For a 64-bit bitmask try whether changing 16 bits to all ones or
4025 zeroes creates a valid bitmask. To check any repeated bitmask,
4026 try using 16 bits from the other 32-bit half of val. */
4027
4028 for (i = 0; i < 64; i += 16, mask <<= 16)
4029 {
4030 val2 = val & ~mask;
4031 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4032 break;
4033 val2 = val | mask;
4034 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4035 break;
4036 val2 = val2 & ~mask;
4037 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4038 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4039 break;
4040 }
4041 if (i != 64)
4042 {
4043 if (generate)
4044 {
4045 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4046 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4047 GEN_INT ((val >> i) & 0xffff)));
4048 }
4049 return 2;
4050 }
4051 }
4052
4053 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4054 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4055 otherwise skip zero bits. */
4056
4057 num_insns = 1;
4058 mask = 0xffff;
4059 val2 = one_match > zero_match ? ~val : val;
4060 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4061
4062 if (generate)
4063 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4064 ? (val | ~(mask << i))
4065 : (val & (mask << i)))));
4066 for (i += 16; i < 64; i += 16)
4067 {
4068 if ((val2 & (mask << i)) == 0)
4069 continue;
4070 if (generate)
4071 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4072 GEN_INT ((val >> i) & 0xffff)));
4073 num_insns++;
4074 }
4075
4076 return num_insns;
4077 }
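
/* Worked examples for the function above (illustrative only):
   0x1234000000005678 takes the SImode shortcut and needs two
   instructions, a MOV of 0x5678 followed by a MOVK of 0x1234 at bit 48.
   0x1234ffff5678ffff reaches the final loop, where a single MOVN-class
   move covers everything except the 0x1234 halfword, which then needs
   one MOVK, again giving two instructions.  */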
4078
4079 /* Return whether imm is a 128-bit immediate which is simple enough to
4080 expand inline. */
4081 bool
4082 aarch64_mov128_immediate (rtx imm)
4083 {
4084 if (GET_CODE (imm) == CONST_INT)
4085 return true;
4086
4087 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4088
4089 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4090 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4091
4092 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4093 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4094 }
4095
4096
4097 /* Return the number of temporary registers that aarch64_add_offset_1
4098 would need to add OFFSET to a register. */
4099
4100 static unsigned int
4101 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4102 {
4103 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4104 }
4105
4106 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4107 a non-polynomial OFFSET. MODE is the mode of the addition.
4108 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4109 be set and CFA adjustments added to the generated instructions.
4110
4111 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4112 temporary if register allocation is already complete. This temporary
4113 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4114 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4115 the immediate again.
4116
4117 Since this function may be used to adjust the stack pointer, we must
4118 ensure that it cannot cause transient stack deallocation (for example
4119 by first incrementing SP and then decrementing when adjusting by a
4120 large immediate). */
4121
4122 static void
4123 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4124 rtx src, HOST_WIDE_INT offset, rtx temp1,
4125 bool frame_related_p, bool emit_move_imm)
4126 {
4127 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4128 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4129
4130 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4131 rtx_insn *insn;
4132
4133 if (!moffset)
4134 {
4135 if (!rtx_equal_p (dest, src))
4136 {
4137 insn = emit_insn (gen_rtx_SET (dest, src));
4138 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4139 }
4140 return;
4141 }
4142
4143 /* Single instruction adjustment. */
4144 if (aarch64_uimm12_shift (moffset))
4145 {
4146 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4147 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4148 return;
4149 }
4150
4151 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4152 and either:
4153
4154 a) the offset cannot be loaded by a 16-bit move or
4155 b) there is no spare register into which we can move it. */
4156 if (moffset < 0x1000000
4157 && ((!temp1 && !can_create_pseudo_p ())
4158 || !aarch64_move_imm (moffset, mode)))
4159 {
4160 HOST_WIDE_INT low_off = moffset & 0xfff;
4161
4162 low_off = offset < 0 ? -low_off : low_off;
4163 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4164 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4165 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4166 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4167 return;
4168 }
4169
4170 /* Emit a move immediate if required and an addition/subtraction. */
4171 if (emit_move_imm)
4172 {
4173 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4174 temp1 = aarch64_force_temporary (mode, temp1,
4175 gen_int_mode (moffset, mode));
4176 }
4177 insn = emit_insn (offset < 0
4178 ? gen_sub3_insn (dest, src, temp1)
4179 : gen_add3_insn (dest, src, temp1));
4180 if (frame_related_p)
4181 {
4182 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4183 rtx adj = plus_constant (mode, src, offset);
4184 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4185 }
4186 }
4187
4188 /* Return the number of temporary registers that aarch64_add_offset
4189 would need to move OFFSET into a register or add OFFSET to a register;
4190 ADD_P is true if we want the latter rather than the former. */
4191
4192 static unsigned int
4193 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4194 {
4195 /* This follows the same structure as aarch64_add_offset. */
4196 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4197 return 0;
4198
4199 unsigned int count = 0;
4200 HOST_WIDE_INT factor = offset.coeffs[1];
4201 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4202 poly_int64 poly_offset (factor, factor);
4203 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4204 /* Need one register for the ADDVL/ADDPL result. */
4205 count += 1;
4206 else if (factor != 0)
4207 {
4208 factor = abs (factor);
4209 if (factor > 16 * (factor & -factor))
4210 /* Need one register for the CNT result and one for the multiplication
4211 factor. If necessary, the second temporary can be reused for the
4212 constant part of the offset. */
4213 return 2;
4214 /* Need one register for the CNT result (which might then
4215 be shifted). */
4216 count += 1;
4217 }
4218 return count + aarch64_add_offset_1_temporaries (constant);
4219 }
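
/* For example, an offset of (34, 32) needs one temporary (for the ADDVL
   result that handles the (32, 32) part), while a purely constant offset
   of 34 needs none.  */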
4220
4221 /* If X can be represented as a poly_int64, return the number
4222 of temporaries that are required to add it to a register.
4223 Return -1 otherwise. */
4224
4225 int
4226 aarch64_add_offset_temporaries (rtx x)
4227 {
4228 poly_int64 offset;
4229 if (!poly_int_rtx_p (x, &offset))
4230 return -1;
4231 return aarch64_offset_temporaries (true, offset);
4232 }
4233
4234 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4235 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4236 be set and CFA adjustments added to the generated instructions.
4237
4238 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4239 temporary if register allocation is already complete. This temporary
4240 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4241 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4242 false to avoid emitting the immediate again.
4243
4244 TEMP2, if nonnull, is a second temporary register that doesn't
4245 overlap either DEST or SRC.
4246
4247 Since this function may be used to adjust the stack pointer, we must
4248 ensure that it cannot cause transient stack deallocation (for example
4249 by first incrementing SP and then decrementing when adjusting by a
4250 large immediate). */
4251
4252 static void
4253 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4254 poly_int64 offset, rtx temp1, rtx temp2,
4255 bool frame_related_p, bool emit_move_imm = true)
4256 {
4257 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4258 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4259 gcc_assert (temp1 == NULL_RTX
4260 || !frame_related_p
4261 || !reg_overlap_mentioned_p (temp1, dest));
4262 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4263
4264 /* Try using ADDVL or ADDPL to add the whole value. */
4265 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4266 {
4267 rtx offset_rtx = gen_int_mode (offset, mode);
4268 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4269 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4270 return;
4271 }
4272
4273 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4274 SVE vector register, over and above the minimum size of 128 bits.
4275 This is equivalent to half the value returned by CNTD with a
4276 vector shape of ALL. */
4277 HOST_WIDE_INT factor = offset.coeffs[1];
4278 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4279
4280 /* Try using ADDVL or ADDPL to add the VG-based part. */
4281 poly_int64 poly_offset (factor, factor);
4282 if (src != const0_rtx
4283 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4284 {
4285 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4286 if (frame_related_p)
4287 {
4288 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4289 RTX_FRAME_RELATED_P (insn) = true;
4290 src = dest;
4291 }
4292 else
4293 {
4294 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4295 src = aarch64_force_temporary (mode, temp1, addr);
4296 temp1 = temp2;
4297 temp2 = NULL_RTX;
4298 }
4299 }
4300 /* Otherwise use a CNT-based sequence. */
4301 else if (factor != 0)
4302 {
4303 /* Use a subtraction if we have a negative factor. */
4304 rtx_code code = PLUS;
4305 if (factor < 0)
4306 {
4307 factor = -factor;
4308 code = MINUS;
4309 }
4310
4311 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4312 into the multiplication. */
4313 rtx val;
4314 int shift = 0;
4315 if (factor & 1)
4316 /* Use a right shift by 1. */
4317 shift = -1;
4318 else
4319 factor /= 2;
4320 HOST_WIDE_INT low_bit = factor & -factor;
4321 if (factor <= 16 * low_bit)
4322 {
4323 if (factor > 16 * 8)
4324 {
4325 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4326 the value with the minimum multiplier and shift it into
4327 position. */
4328 int extra_shift = exact_log2 (low_bit);
4329 shift += extra_shift;
4330 factor >>= extra_shift;
4331 }
4332 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4333 }
4334 else
4335 {
4336 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4337 directly, since that should increase the chances of being
4338 able to use a shift and add sequence. If LOW_BIT itself
4339 is out of range, just use CNTD. */
4340 if (low_bit <= 16 * 8)
4341 factor /= low_bit;
4342 else
4343 low_bit = 1;
4344
4345 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4346 val = aarch64_force_temporary (mode, temp1, val);
4347
4348 if (can_create_pseudo_p ())
4349 {
4350 rtx coeff1 = gen_int_mode (factor, mode);
4351 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4352 }
4353 else
4354 {
4355 /* Go back to using a negative multiplication factor if we have
4356 no register from which to subtract. */
4357 if (code == MINUS && src == const0_rtx)
4358 {
4359 factor = -factor;
4360 code = PLUS;
4361 }
4362 rtx coeff1 = gen_int_mode (factor, mode);
4363 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4364 val = gen_rtx_MULT (mode, val, coeff1);
4365 }
4366 }
4367
4368 if (shift > 0)
4369 {
4370 /* Multiply by 1 << SHIFT. */
4371 val = aarch64_force_temporary (mode, temp1, val);
4372 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4373 }
4374 else if (shift == -1)
4375 {
4376 /* Divide by 2. */
4377 val = aarch64_force_temporary (mode, temp1, val);
4378 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4379 }
4380
4381 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4382 if (src != const0_rtx)
4383 {
4384 val = aarch64_force_temporary (mode, temp1, val);
4385 val = gen_rtx_fmt_ee (code, mode, src, val);
4386 }
4387 else if (code == MINUS)
4388 {
4389 val = aarch64_force_temporary (mode, temp1, val);
4390 val = gen_rtx_NEG (mode, val);
4391 }
4392
4393 if (constant == 0 || frame_related_p)
4394 {
4395 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4396 if (frame_related_p)
4397 {
4398 RTX_FRAME_RELATED_P (insn) = true;
4399 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4400 gen_rtx_SET (dest, plus_constant (Pmode, src,
4401 poly_offset)));
4402 }
4403 src = dest;
4404 if (constant == 0)
4405 return;
4406 }
4407 else
4408 {
4409 src = aarch64_force_temporary (mode, temp1, val);
4410 temp1 = temp2;
4411 temp2 = NULL_RTX;
4412 }
4413
4414 emit_move_imm = true;
4415 }
4416
4417 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4418 frame_related_p, emit_move_imm);
4419 }
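
/* Worked example for the function above: an OFFSET of (34, 32) splits
   into a VG-based part of (32, 32), i.e. two full vector lengths, and a
   constant part of 2, so the addition becomes an ADDVL #2 followed by an
   ADD #2.  The function below is a minimal sketch of such a call; the
   wrapper is hypothetical and simply feeds fixed values to
   aarch64_add_offset.  */
static ATTRIBUTE_UNUSED void
aarch64_sketch_add_two_vl_plus_two (rtx dest, rtx src)
{
  /* No temporaries are supplied, so this relies on being used before
     register allocation, where new pseudos can be created.  */
  aarch64_add_offset (Pmode, dest, src, poly_int64 (34, 32),
		      NULL_RTX, NULL_RTX, false);
}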
4420
4421 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4422 than a poly_int64. */
4423
4424 void
4425 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4426 rtx offset_rtx, rtx temp1, rtx temp2)
4427 {
4428 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4429 temp1, temp2, false);
4430 }
4431
4432 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4433 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4434 if TEMP1 already contains abs (DELTA). */
4435
4436 static inline void
4437 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4438 {
4439 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4440 temp1, temp2, true, emit_move_imm);
4441 }
4442
4443 /* Subtract DELTA from the stack pointer, marking the instructions
4444 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4445 if nonnull. */
4446
4447 static inline void
4448 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4449 bool emit_move_imm = true)
4450 {
4451 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4452 temp1, temp2, frame_related_p, emit_move_imm);
4453 }
4454
4455 /* Set DEST to (vec_series BASE STEP). */
4456
4457 static void
4458 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4459 {
4460 machine_mode mode = GET_MODE (dest);
4461 scalar_mode inner = GET_MODE_INNER (mode);
4462
4463 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4464 if (!aarch64_sve_index_immediate_p (base))
4465 base = force_reg (inner, base);
4466 if (!aarch64_sve_index_immediate_p (step))
4467 step = force_reg (inner, step);
4468
4469 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4470 }
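
/* For example, an INDEX with BASE 0 and STEP 1 keeps both operands as
   immediates, since they fit the [-16, 15] range, whereas a STEP of 17
   is first forced into a register.  */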
4471
4472 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4473 register of mode MODE. Use TARGET for the result if it's nonnull
4474 and convenient.
4475
4476 The two vector modes must have the same element mode. The behavior
4477 is to duplicate architectural lane N of SRC into architectural lanes
4478 N + I * STEP of the result. On big-endian targets, architectural
4479 lane 0 of an Advanced SIMD vector is the last element of the vector
4480 in memory layout, so for big-endian targets this operation has the
4481 effect of reversing SRC before duplicating it. Callers need to
4482 account for this. */
4483
4484 rtx
4485 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4486 {
4487 machine_mode src_mode = GET_MODE (src);
4488 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4489 insn_code icode = (BYTES_BIG_ENDIAN
4490 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4491 : code_for_aarch64_vec_duplicate_vq_le (mode));
4492
4493 unsigned int i = 0;
4494 expand_operand ops[3];
4495 create_output_operand (&ops[i++], target, mode);
4496 create_input_operand (&ops[i++], src, src_mode);
4497 if (BYTES_BIG_ENDIAN)
4498 {
4499 /* Create a PARALLEL describing the reversal of SRC. */
4500 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4501 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4502 nelts_per_vq - 1, -1);
4503 create_fixed_operand (&ops[i++], sel);
4504 }
4505 expand_insn (icode, i, ops);
4506 return ops[0].value;
4507 }
4508
4509 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4510 the memory image into DEST. Return true on success. */
4511
4512 static bool
4513 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4514 {
4515 src = force_const_mem (GET_MODE (src), src);
4516 if (!src)
4517 return false;
4518
4519 /* Make sure that the address is legitimate. */
4520 if (!aarch64_sve_ld1rq_operand_p (src))
4521 {
4522 rtx addr = force_reg (Pmode, XEXP (src, 0));
4523 src = replace_equiv_address (src, addr);
4524 }
4525
4526 machine_mode mode = GET_MODE (dest);
4527 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4528 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4529 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4530 return true;
4531 }
4532
4533 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4534 SVE data mode and isn't a legitimate constant. Use TARGET for the
4535 result if convenient.
4536
4537 The returned register can have whatever mode seems most natural
4538 given the contents of SRC. */
4539
4540 static rtx
4541 aarch64_expand_sve_const_vector (rtx target, rtx src)
4542 {
4543 machine_mode mode = GET_MODE (src);
4544 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4545 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4546 scalar_mode elt_mode = GET_MODE_INNER (mode);
4547 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4548 unsigned int container_bits = aarch64_sve_container_bits (mode);
4549 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4550
4551 if (nelts_per_pattern == 1
4552 && encoded_bits <= 128
4553 && container_bits != elt_bits)
4554 {
4555 /* We have a partial vector mode and a constant whose full-vector
4556 equivalent would occupy a repeating 128-bit sequence. Build that
4557 full-vector equivalent instead, so that we have the option of
4558 using LD1RQ and Advanced SIMD operations. */
4559 unsigned int repeat = container_bits / elt_bits;
4560 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4561 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4562 for (unsigned int i = 0; i < npatterns; ++i)
4563 for (unsigned int j = 0; j < repeat; ++j)
4564 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4565 target = aarch64_target_reg (target, full_mode);
4566 return aarch64_expand_sve_const_vector (target, builder.build ());
4567 }
4568
4569 if (nelts_per_pattern == 1 && encoded_bits == 128)
4570 {
4571 /* The constant is a duplicated quadword but can't be narrowed
4572 beyond a quadword. Get the memory image of the first quadword
4573 as a 128-bit vector and try using LD1RQ to load it from memory.
4574
4575 The effect for both endiannesses is to load memory lane N into
4576 architectural lanes N + I * STEP of the result. On big-endian
4577 targets, the layout of the 128-bit vector in an Advanced SIMD
4578 register would be different from its layout in an SVE register,
4579 but this 128-bit vector is a memory value only. */
4580 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4581 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4582 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4583 return target;
4584 }
4585
4586 if (nelts_per_pattern == 1 && encoded_bits < 128)
4587 {
4588 /* The vector is a repeating sequence of 64 bits or fewer.
4589 See if we can load them using an Advanced SIMD move and then
4590 duplicate it to fill a vector. This is better than using a GPR
4591 move because it keeps everything in the same register file. */
4592 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4593 rtx_vector_builder builder (vq_mode, npatterns, 1);
4594 for (unsigned int i = 0; i < npatterns; ++i)
4595 {
4596 /* We want memory lane N to go into architectural lane N,
4597 so reverse for big-endian targets. The DUP .Q pattern
4598 has a compensating reverse built-in. */
4599 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4600 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4601 }
4602 rtx vq_src = builder.build ();
4603 if (aarch64_simd_valid_immediate (vq_src, NULL))
4604 {
4605 vq_src = force_reg (vq_mode, vq_src);
4606 return aarch64_expand_sve_dupq (target, mode, vq_src);
4607 }
4608
4609 /* Get an integer representation of the repeating part of Advanced
4610 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4611 which for big-endian targets is lane-swapped wrt a normal
4612 Advanced SIMD vector. This means that for both endiannesses,
4613 memory lane N of SVE vector SRC corresponds to architectural
4614 lane N of a register holding VQ_SRC. This in turn means that
4615 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4616 as a single 128-bit value) and thus that memory lane 0 of SRC is
4617 in the lsb of the integer. Duplicating the integer therefore
4618 ensures that memory lane N of SRC goes into architectural lane
4619 N + I * INDEX of the SVE register. */
4620 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4621 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4622 if (elt_value)
4623 {
4624 /* Pretend that we had a vector of INT_MODE to start with. */
4625 elt_mode = int_mode;
4626 mode = aarch64_full_sve_mode (int_mode).require ();
4627
4628 /* If the integer can be moved into a general register by a
4629 single instruction, do that and duplicate the result. */
4630 if (CONST_INT_P (elt_value)
4631 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4632 {
4633 elt_value = force_reg (elt_mode, elt_value);
4634 return expand_vector_broadcast (mode, elt_value);
4635 }
4636 }
4637 else if (npatterns == 1)
4638 /* We're duplicating a single value, but can't do better than
4639 force it to memory and load from there. This handles things
4640 like symbolic constants. */
4641 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4642
4643 if (elt_value)
4644 {
4645 /* Load the element from memory if we can, otherwise move it into
4646 a register and use a DUP. */
4647 rtx op = force_const_mem (elt_mode, elt_value);
4648 if (!op)
4649 op = force_reg (elt_mode, elt_value);
4650 return expand_vector_broadcast (mode, op);
4651 }
4652 }
4653
4654 /* Try using INDEX. */
4655 rtx base, step;
4656 if (const_vec_series_p (src, &base, &step))
4657 {
4658 aarch64_expand_vec_series (target, base, step);
4659 return target;
4660 }
4661
4662 /* From here on, it's better to force the whole constant to memory
4663 if we can. */
4664 if (GET_MODE_NUNITS (mode).is_constant ())
4665 return NULL_RTX;
4666
4667 /* Expand each pattern individually. */
4668 gcc_assert (npatterns > 1);
4669 rtx_vector_builder builder;
4670 auto_vec<rtx, 16> vectors (npatterns);
4671 for (unsigned int i = 0; i < npatterns; ++i)
4672 {
4673 builder.new_vector (mode, 1, nelts_per_pattern);
4674 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4675 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4676 vectors.quick_push (force_reg (mode, builder.build ()));
4677 }
4678
4679 /* Use permutes to interleave the separate vectors. */
4680 while (npatterns > 1)
4681 {
4682 npatterns /= 2;
4683 for (unsigned int i = 0; i < npatterns; ++i)
4684 {
4685 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4686 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4687 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4688 vectors[i] = tmp;
4689 }
4690 }
4691 gcc_assert (vectors[0] == target);
4692 return target;
4693 }
4694
4695 /* Use WHILE to set a predicate register of mode MODE in which the first
4696 VL bits are set and the rest are clear. Use TARGET for the register
4697 if it's nonnull and convenient. */
4698
4699 static rtx
4700 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4701 unsigned int vl)
4702 {
4703 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4704 target = aarch64_target_reg (target, mode);
4705 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4706 target, const0_rtx, limit));
4707 return target;
4708 }
4709
4710 static rtx
4711 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4712
4713 /* BUILDER is a constant predicate in which the index of every set bit
4714 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4715 by inverting every element at a multiple of ELT_SIZE and EORing the
4716 result with an ELT_SIZE PTRUE.
4717
4718 Return a register that contains the constant on success, otherwise
4719 return null. Use TARGET as the register if it is nonnull and
4720 convenient. */
4721
4722 static rtx
4723 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4724 unsigned int elt_size)
4725 {
4726 /* Invert every element at a multiple of ELT_SIZE, keeping the
4727 other bits zero. */
4728 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4729 builder.nelts_per_pattern ());
4730 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4731 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4732 inv_builder.quick_push (const1_rtx);
4733 else
4734 inv_builder.quick_push (const0_rtx);
4735 inv_builder.finalize ();
4736
4737 /* See if we can load the constant cheaply. */
4738 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4739 if (!inv)
4740 return NULL_RTX;
4741
4742 /* EOR the result with an ELT_SIZE PTRUE. */
4743 rtx mask = aarch64_ptrue_all (elt_size);
4744 mask = force_reg (VNx16BImode, mask);
4745 target = aarch64_target_reg (target, VNx16BImode);
4746 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4747 return target;
4748 }
4749
4750 /* BUILDER is a constant predicate in which the index of every set bit
4751 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4752 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4753 register on success, otherwise return null. Use TARGET as the register
4754 if nonnull and convenient. */
4755
4756 static rtx
4757 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4758 unsigned int elt_size,
4759 unsigned int permute_size)
4760 {
4761 /* We're going to split the constant into two new constants A and B,
4762 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4763 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4764
4765 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4766 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4767
4768 where _ indicates elements that will be discarded by the permute.
4769
4770 First calculate the ELT_SIZEs for A and B. */
4771 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4772 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4773 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4774 if (INTVAL (builder.elt (i)) != 0)
4775 {
4776 if (i & permute_size)
4777 b_elt_size |= i - permute_size;
4778 else
4779 a_elt_size |= i;
4780 }
4781 a_elt_size &= -a_elt_size;
4782 b_elt_size &= -b_elt_size;
4783
4784 /* Now construct the vectors themselves. */
4785 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4786 builder.nelts_per_pattern ());
4787 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4788 builder.nelts_per_pattern ());
4789 unsigned int nelts = builder.encoded_nelts ();
4790 for (unsigned int i = 0; i < nelts; ++i)
4791 if (i & (elt_size - 1))
4792 {
4793 a_builder.quick_push (const0_rtx);
4794 b_builder.quick_push (const0_rtx);
4795 }
4796 else if ((i & permute_size) == 0)
4797 {
4798 /* The A and B elements are significant. */
4799 a_builder.quick_push (builder.elt (i));
4800 b_builder.quick_push (builder.elt (i + permute_size));
4801 }
4802 else
4803 {
4804 /* The A and B elements are going to be discarded, so pick whatever
4805 is likely to give a nice constant. We are targeting element
4806 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4807 with the aim of each being a sequence of ones followed by
4808 a sequence of zeros. So:
4809
4810 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4811 duplicate the last X_ELT_SIZE element, to extend the
4812 current sequence of ones or zeros.
4813
4814 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4815 zero, so that the constant really does have X_ELT_SIZE and
4816 not a smaller size. */
4817 if (a_elt_size > permute_size)
4818 a_builder.quick_push (const0_rtx);
4819 else
4820 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4821 if (b_elt_size > permute_size)
4822 b_builder.quick_push (const0_rtx);
4823 else
4824 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4825 }
4826 a_builder.finalize ();
4827 b_builder.finalize ();
4828
4829 /* Try loading A into a register. */
4830 rtx_insn *last = get_last_insn ();
4831 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4832 if (!a)
4833 return NULL_RTX;
4834
4835 /* Try loading B into a register. */
4836 rtx b = a;
4837 if (a_builder != b_builder)
4838 {
4839 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4840 if (!b)
4841 {
4842 delete_insns_since (last);
4843 return NULL_RTX;
4844 }
4845 }
4846
4847 /* Emit the TRN1 itself. */
4848 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4849 target = aarch64_target_reg (target, mode);
4850 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4851 gen_lowpart (mode, a),
4852 gen_lowpart (mode, b)));
4853 return target;
4854 }
4855
4856 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4857 constant in BUILDER into an SVE predicate register. Return the register
4858 on success, otherwise return null. Use TARGET for the register if
4859 nonnull and convenient.
4860
4861 ALLOW_RECURSE_P is true if we can use methods that would call this
4862 function recursively. */
4863
4864 static rtx
4865 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4866 bool allow_recurse_p)
4867 {
4868 if (builder.encoded_nelts () == 1)
4869 /* A PFALSE or a PTRUE .B ALL. */
4870 return aarch64_emit_set_immediate (target, builder);
4871
4872 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4873 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4874 {
4875 /* If we can load the constant using PTRUE, use it as-is. */
4876 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4877 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4878 return aarch64_emit_set_immediate (target, builder);
4879
4880 /* Otherwise use WHILE to set the first VL bits. */
4881 return aarch64_sve_move_pred_via_while (target, mode, vl);
4882 }
4883
4884 if (!allow_recurse_p)
4885 return NULL_RTX;
4886
4887 /* Try inverting the vector in element size ELT_SIZE and then EORing
4888 the result with an ELT_SIZE PTRUE. */
4889 if (INTVAL (builder.elt (0)) == 0)
4890 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4891 elt_size))
4892 return res;
4893
4894 /* Try using TRN1 to permute two simpler constants. */
4895 for (unsigned int i = elt_size; i <= 8; i *= 2)
4896 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4897 elt_size, i))
4898 return res;
4899
4900 return NULL_RTX;
4901 }
4902
4903 /* Return an SVE predicate register that contains the VNx16BImode
4904 constant in BUILDER, without going through the move expanders.
4905
4906 The returned register can have whatever mode seems most natural
4907 given the contents of BUILDER. Use TARGET for the result if
4908 convenient. */
4909
4910 static rtx
4911 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4912 {
4913 /* Try loading the constant using pure predicate operations. */
4914 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4915 return res;
4916
4917 /* Try forcing the constant to memory. */
4918 if (builder.full_nelts ().is_constant ())
4919 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4920 {
4921 target = aarch64_target_reg (target, VNx16BImode);
4922 emit_move_insn (target, mem);
4923 return target;
4924 }
4925
4926 /* The last resort is to load the constant as an integer and then
4927 compare it against zero. Use -1 for set bits in order to increase
4928 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4929 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4930 builder.nelts_per_pattern ());
4931 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4932 int_builder.quick_push (INTVAL (builder.elt (i))
4933 ? constm1_rtx : const0_rtx);
4934 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4935 int_builder.build ());
4936 }
4937
4938 /* Set DEST to immediate IMM. */
4939
4940 void
4941 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4942 {
4943 machine_mode mode = GET_MODE (dest);
4944
4945 /* Check on what type of symbol it is. */
4946 scalar_int_mode int_mode;
4947 if ((GET_CODE (imm) == SYMBOL_REF
4948 || GET_CODE (imm) == LABEL_REF
4949 || GET_CODE (imm) == CONST
4950 || GET_CODE (imm) == CONST_POLY_INT)
4951 && is_a <scalar_int_mode> (mode, &int_mode))
4952 {
4953 rtx mem;
4954 poly_int64 offset;
4955 HOST_WIDE_INT const_offset;
4956 enum aarch64_symbol_type sty;
4957
4958 /* If we have (const (plus symbol offset)), separate out the offset
4959 before we start classifying the symbol. */
4960 rtx base = strip_offset (imm, &offset);
4961
4962 /* We must always add an offset involving VL separately, rather than
4963 folding it into the relocation. */
4964 if (!offset.is_constant (&const_offset))
4965 {
4966 if (!TARGET_SVE)
4967 {
4968 aarch64_report_sve_required ();
4969 return;
4970 }
4971 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4972 emit_insn (gen_rtx_SET (dest, imm));
4973 else
4974 {
4975 /* Do arithmetic on 32-bit values if the result is smaller
4976 than that. */
4977 if (partial_subreg_p (int_mode, SImode))
4978 {
4979 /* It is invalid to do symbol calculations in modes
4980 narrower than SImode. */
4981 gcc_assert (base == const0_rtx);
4982 dest = gen_lowpart (SImode, dest);
4983 int_mode = SImode;
4984 }
4985 if (base != const0_rtx)
4986 {
4987 base = aarch64_force_temporary (int_mode, dest, base);
4988 aarch64_add_offset (int_mode, dest, base, offset,
4989 NULL_RTX, NULL_RTX, false);
4990 }
4991 else
4992 aarch64_add_offset (int_mode, dest, base, offset,
4993 dest, NULL_RTX, false);
4994 }
4995 return;
4996 }
4997
4998 sty = aarch64_classify_symbol (base, const_offset);
4999 switch (sty)
5000 {
5001 case SYMBOL_FORCE_TO_MEM:
5002 if (const_offset != 0
5003 && targetm.cannot_force_const_mem (int_mode, imm))
5004 {
5005 gcc_assert (can_create_pseudo_p ());
5006 base = aarch64_force_temporary (int_mode, dest, base);
5007 aarch64_add_offset (int_mode, dest, base, const_offset,
5008 NULL_RTX, NULL_RTX, false);
5009 return;
5010 }
5011
5012 mem = force_const_mem (ptr_mode, imm);
5013 gcc_assert (mem);
5014
5015 /* If we aren't generating PC relative literals, then
5016 we need to expand the literal pool access carefully.
5017 This is something that needs to be done in a number
5018 of places, so could well live as a separate function. */
5019 if (!aarch64_pcrelative_literal_loads)
5020 {
5021 gcc_assert (can_create_pseudo_p ());
5022 base = gen_reg_rtx (ptr_mode);
5023 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5024 if (ptr_mode != Pmode)
5025 base = convert_memory_address (Pmode, base);
5026 mem = gen_rtx_MEM (ptr_mode, base);
5027 }
5028
5029 if (int_mode != ptr_mode)
5030 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5031
5032 emit_insn (gen_rtx_SET (dest, mem));
5033
5034 return;
5035
5036 case SYMBOL_SMALL_TLSGD:
5037 case SYMBOL_SMALL_TLSDESC:
5038 case SYMBOL_SMALL_TLSIE:
5039 case SYMBOL_SMALL_GOT_28K:
5040 case SYMBOL_SMALL_GOT_4G:
5041 case SYMBOL_TINY_GOT:
5042 case SYMBOL_TINY_TLSIE:
5043 if (const_offset != 0)
5044 {
5045 gcc_assert (can_create_pseudo_p ());
5046 base = aarch64_force_temporary (int_mode, dest, base);
5047 aarch64_add_offset (int_mode, dest, base, const_offset,
5048 NULL_RTX, NULL_RTX, false);
5049 return;
5050 }
5051 /* FALLTHRU */
5052
5053 case SYMBOL_SMALL_ABSOLUTE:
5054 case SYMBOL_TINY_ABSOLUTE:
5055 case SYMBOL_TLSLE12:
5056 case SYMBOL_TLSLE24:
5057 case SYMBOL_TLSLE32:
5058 case SYMBOL_TLSLE48:
5059 aarch64_load_symref_appropriately (dest, imm, sty);
5060 return;
5061
5062 default:
5063 gcc_unreachable ();
5064 }
5065 }
5066
5067 if (!CONST_INT_P (imm))
5068 {
5069 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5070 {
5071 /* Only the low bit of each .H, .S and .D element is defined,
5072 so we can set the upper bits to whatever we like. If the
5073 predicate is all-true in MODE, prefer to set all the undefined
5074 bits as well, so that we can share a single .B predicate for
5075 all modes. */
5076 if (imm == CONSTM1_RTX (mode))
5077 imm = CONSTM1_RTX (VNx16BImode);
5078
5079 /* All methods for constructing predicate modes wider than VNx16BI
5080 will set the upper bits of each element to zero. Expose this
5081 by moving such constants as a VNx16BI, so that all bits are
5082 significant and so that constants for different modes can be
5083 shared. The wider constant will still be available as a
5084 REG_EQUAL note. */
5085 rtx_vector_builder builder;
5086 if (aarch64_get_sve_pred_bits (builder, imm))
5087 {
5088 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5089 if (dest != res)
5090 emit_move_insn (dest, gen_lowpart (mode, res));
5091 return;
5092 }
5093 }
5094
5095 if (GET_CODE (imm) == HIGH
5096 || aarch64_simd_valid_immediate (imm, NULL))
5097 {
5098 emit_insn (gen_rtx_SET (dest, imm));
5099 return;
5100 }
5101
5102 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5103 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5104 {
5105 if (dest != res)
5106 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5107 return;
5108 }
5109
5110 rtx mem = force_const_mem (mode, imm);
5111 gcc_assert (mem);
5112 emit_move_insn (dest, mem);
5113 return;
5114 }
5115
5116 aarch64_internal_mov_immediate (dest, imm, true,
5117 as_a <scalar_int_mode> (mode));
5118 }
5119
5120 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5121 that is known to contain PTRUE. */
5122
5123 void
5124 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5125 {
5126 expand_operand ops[3];
5127 machine_mode mode = GET_MODE (dest);
5128 create_output_operand (&ops[0], dest, mode);
5129 create_input_operand (&ops[1], pred, GET_MODE (pred));
5130 create_input_operand (&ops[2], src, mode);
5131 temporary_volatile_ok v (true);
5132 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5133 }
5134
5135 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5136 operand is in memory. In this case we need to use the predicated LD1
5137 and ST1 instead of LDR and STR, both for correctness on big-endian
5138 targets and because LD1 and ST1 support a wider range of addressing modes.
5139 PRED_MODE is the mode of the predicate.
5140
5141 See the comment at the head of aarch64-sve.md for details about the
5142 big-endian handling. */
5143
5144 void
5145 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5146 {
5147 machine_mode mode = GET_MODE (dest);
5148 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5149 if (!register_operand (src, mode)
5150 && !register_operand (dest, mode))
5151 {
5152 rtx tmp = gen_reg_rtx (mode);
5153 if (MEM_P (src))
5154 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5155 else
5156 emit_move_insn (tmp, src);
5157 src = tmp;
5158 }
5159 aarch64_emit_sve_pred_move (dest, ptrue, src);
5160 }
5161
5162 /* Called only on big-endian targets. See whether an SVE vector move
5163 from SRC to DEST is effectively a REV[BHW] instruction, because at
5164 least one operand is a subreg of an SVE vector that has wider or
5165 narrower elements. Return true and emit the instruction if so.
5166
5167 For example:
5168
5169 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5170
5171 represents a VIEW_CONVERT between the following vectors, viewed
5172 in memory order:
5173
5174 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5175 R1: { [0], [1], [2], [3], ... }
5176
5177 The high part of lane X in R2 should therefore correspond to lane X*2
5178 of R1, but the register representations are:
5179
5180 msb lsb
5181 R2: ...... [1].high [1].low [0].high [0].low
5182 R1: ...... [3] [2] [1] [0]
5183
5184 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5185 We therefore need a reverse operation to swap the high and low values
5186 around.
5187
5188 This is purely an optimization. Without it we would spill the
5189 subreg operand to the stack in one mode and reload it in the
5190 other mode, which has the same effect as the REV. */
5191
5192 bool
5193 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5194 {
5195 gcc_assert (BYTES_BIG_ENDIAN);
5196 if (GET_CODE (dest) == SUBREG)
5197 dest = SUBREG_REG (dest);
5198 if (GET_CODE (src) == SUBREG)
5199 src = SUBREG_REG (src);
5200
5201 /* The optimization handles two single SVE REGs with different element
5202 sizes. */
5203 if (!REG_P (dest)
5204 || !REG_P (src)
5205 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5206 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5207 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5208 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5209 return false;
5210
5211 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
5212 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5213 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5214 UNSPEC_REV_SUBREG);
5215 emit_insn (gen_rtx_SET (dest, unspec));
5216 return true;
5217 }
5218
5219 /* Return a copy of X with mode MODE, without changing its other
5220 attributes. Unlike gen_lowpart, this doesn't care whether the
5221 mode change is valid. */
5222
5223 rtx
5224 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5225 {
5226 if (GET_MODE (x) == mode)
5227 return x;
5228
5229 x = shallow_copy_rtx (x);
5230 set_mode_and_regno (x, mode, REGNO (x));
5231 return x;
5232 }
5233
5234 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5235 stored in wider integer containers. */
5236
5237 static unsigned int
5238 aarch64_sve_rev_unspec (machine_mode mode)
5239 {
5240 switch (GET_MODE_UNIT_SIZE (mode))
5241 {
5242 case 1: return UNSPEC_REVB;
5243 case 2: return UNSPEC_REVH;
5244 case 4: return UNSPEC_REVW;
5245 }
5246 gcc_unreachable ();
5247 }
5248
5249 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5250 operands. */
5251
5252 void
5253 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5254 {
5255 /* Decide which REV operation we need. The mode with wider elements
5256 determines the mode of the operands and the mode with the narrower
5257 elements determines the reverse width. */
5258 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5259 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5260 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5261 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5262 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5263
5264 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5265 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5266
5267 /* Get the operands in the appropriate modes and emit the instruction. */
5268 ptrue = gen_lowpart (pred_mode, ptrue);
5269 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5270 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5271 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5272 dest, ptrue, src));
5273 }
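/* For example (an illustrative sketch, register numbers arbitrary), when
   the original move was

       (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   the mode with the wider elements is VNx8HI and the mode with the
   narrower elements is VNx16QI, so the split emits a byte reversal
   within each halfword container, roughly:

       revb    z1.h, p0/m, z2.h  */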
5274
5275 static bool
5276 aarch64_function_ok_for_sibcall (tree, tree exp)
5277 {
5278 if (crtl->abi->id () != expr_callee_abi (exp).id ())
5279 return false;
5280
5281 return true;
5282 }
5283
5284 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5285 passed in SVE registers. */
5286
5287 static bool
5288 aarch64_pass_by_reference_1 (const function_arg_info &arg)
5289 {
5290 HOST_WIDE_INT size;
5291 machine_mode dummymode;
5292 int nregs;
5293
5294 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
5295 if (arg.mode == BLKmode && arg.type)
5296 size = int_size_in_bytes (arg.type);
5297 else
5298 /* No frontends can create types with variable-sized modes, so we
5299 shouldn't be asked to pass or return them. */
5300 size = GET_MODE_SIZE (arg.mode).to_constant ();
5301
5302 /* Aggregates are passed by reference based on their size. */
5303 if (arg.aggregate_type_p ())
5304 size = int_size_in_bytes (arg.type);
5305
5306 /* Variable-sized arguments are always passed by reference. */
5307 if (size < 0)
5308 return true;
5309
5310 /* Can this be a candidate to be passed in fp/simd register(s)? */
5311 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5312 &dummymode, &nregs,
5313 NULL))
5314 return false;
5315
5316 /* Arguments which are variable-sized or larger than 2 registers are
5317 passed by reference unless they are a homogeneous floating-point
5318 aggregate. */
5319 return size > 2 * UNITS_PER_WORD;
5320 }
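/* For example, a plain 24-byte structure of three 64-bit integers exceeds
   2 * UNITS_PER_WORD and so is passed by reference, whereas a 32-byte
   structure of four doubles is a homogeneous floating-point aggregate,
   is caught by the fp/simd candidate check above, and is therefore
   passed by value (in vector registers or on the stack). */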
5321
5322 /* Implement TARGET_PASS_BY_REFERENCE. */
5323
5324 static bool
5325 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5326 const function_arg_info &arg)
5327 {
5328 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5329
5330 if (!arg.type)
5331 return aarch64_pass_by_reference_1 (arg);
5332
5333 pure_scalable_type_info pst_info;
5334 switch (pst_info.analyze (arg.type))
5335 {
5336 case pure_scalable_type_info::IS_PST:
5337 if (pcum && !pcum->silent_p && !TARGET_SVE)
5338 /* We can't gracefully recover at this point, so make this a
5339 fatal error. */
5340 fatal_error (input_location, "arguments of type %qT require"
5341 " the SVE ISA extension", arg.type);
5342
5343 /* Variadic SVE types are passed by reference. Normal non-variadic
5344 arguments are too if we've run out of registers. */
5345 return (!arg.named
5346 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5347 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5348
5349 case pure_scalable_type_info::DOESNT_MATTER:
5350 gcc_assert (aarch64_pass_by_reference_1 (arg));
5351 return true;
5352
5353 case pure_scalable_type_info::NO_ABI_IDENTITY:
5354 case pure_scalable_type_info::ISNT_PST:
5355 return aarch64_pass_by_reference_1 (arg);
5356 }
5357 gcc_unreachable ();
5358 }
5359
5360 /* Return TRUE if VALTYPE is padded to its least significant bits. */
5361 static bool
5362 aarch64_return_in_msb (const_tree valtype)
5363 {
5364 machine_mode dummy_mode;
5365 int dummy_int;
5366
5367 /* Never happens in little-endian mode. */
5368 if (!BYTES_BIG_ENDIAN)
5369 return false;
5370
5371 /* Only composite types smaller than or equal to 16 bytes can
5372 potentially be returned in registers. */
5373 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5374 || int_size_in_bytes (valtype) <= 0
5375 || int_size_in_bytes (valtype) > 16)
5376 return false;
5377
5378 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5379 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5380 is always passed/returned in the least significant bits of fp/simd
5381 register(s). */
5382 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5383 &dummy_mode, &dummy_int, NULL))
5384 return false;
5385
5386 /* Likewise pure scalable types for SVE vector and predicate registers. */
5387 pure_scalable_type_info pst_info;
5388 if (pst_info.analyze_registers (valtype))
5389 return false;
5390
5391 return true;
5392 }
5393
5394 /* Implement TARGET_FUNCTION_VALUE.
5395 Define how to find the value returned by a function. */
5396
5397 static rtx
5398 aarch64_function_value (const_tree type, const_tree func,
5399 bool outgoing ATTRIBUTE_UNUSED)
5400 {
5401 machine_mode mode;
5402 int unsignedp;
5403
5404 mode = TYPE_MODE (type);
5405 if (INTEGRAL_TYPE_P (type))
5406 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5407
5408 pure_scalable_type_info pst_info;
5409 if (type && pst_info.analyze_registers (type))
5410 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5411
5412 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5413 are returned in memory, not by value. */
5414 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5415 bool sve_p = (vec_flags & VEC_ANY_SVE);
5416
5417 if (aarch64_return_in_msb (type))
5418 {
5419 HOST_WIDE_INT size = int_size_in_bytes (type);
5420
5421 if (size % UNITS_PER_WORD != 0)
5422 {
5423 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5424 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5425 }
5426 }
5427
5428 int count;
5429 machine_mode ag_mode;
5430 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
5431 &ag_mode, &count, NULL))
5432 {
5433 gcc_assert (!sve_p);
5434 if (!aarch64_composite_type_p (type, mode))
5435 {
5436 gcc_assert (count == 1 && mode == ag_mode);
5437 return gen_rtx_REG (mode, V0_REGNUM);
5438 }
5439 else
5440 {
5441 int i;
5442 rtx par;
5443
5444 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5445 for (i = 0; i < count; i++)
5446 {
5447 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5448 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5449 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5450 XVECEXP (par, 0, i) = tmp;
5451 }
5452 return par;
5453 }
5454 }
5455 else
5456 {
5457 if (sve_p)
5458 {
5459 /* Vector types can acquire a partial SVE mode using things like
5460 __attribute__((vector_size(N))), and this is potentially useful.
5461 However, the choice of mode doesn't affect the type's ABI
5462 identity, so we should treat the types as though they had
5463 the associated integer mode, just like they did before SVE
5464 was introduced.
5465
5466 We know that the vector must be 128 bits or smaller,
5467 otherwise we'd have returned it in memory instead. */
5468 gcc_assert (type
5469 && (aarch64_some_values_include_pst_objects_p (type)
5470 || (vec_flags & VEC_PARTIAL)));
5471
5472 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5473 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5474 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5475 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5476 }
5477 return gen_rtx_REG (mode, R0_REGNUM);
5478 }
5479 }
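/* For instance, a function returning a homogeneous floating-point
   aggregate of four floats reaches the composite case above and yields
   a (parallel [...]) of V0-V3 in SFmode at byte offsets 0, 4, 8 and 12,
   whereas a plain int is returned directly in R0 (after any promotion
   applied by promote_function_mode above). */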
5480
5481 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5482 Return true if REGNO is the number of a hard register in which the values
5483 of a called function may come back. */
5484
5485 static bool
5486 aarch64_function_value_regno_p (const unsigned int regno)
5487 {
5488 /* A maximum of 16 bytes can be returned in the general registers. Examples
5489 of 16-byte return values are 128-bit integers and 16-byte small
5490 structures (excluding homogeneous floating-point aggregates). */
5491 if (regno == R0_REGNUM || regno == R1_REGNUM)
5492 return true;
5493
5494 /* Up to four fp/simd registers can return a function value, e.g. a
5495 homogeneous floating-point aggregate having four members. */
5496 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5497 return TARGET_FLOAT;
5498
5499 return false;
5500 }
5501
5502 /* Subroutine for aarch64_return_in_memory for types that are not returned
5503 in SVE registers. */
5504
5505 static bool
5506 aarch64_return_in_memory_1 (const_tree type)
5507 {
5508 HOST_WIDE_INT size;
5509 machine_mode ag_mode;
5510 int count;
5511
5512 if (!AGGREGATE_TYPE_P (type)
5513 && TREE_CODE (type) != COMPLEX_TYPE
5514 && TREE_CODE (type) != VECTOR_TYPE)
5515 /* Simple scalar types are always returned in registers. */
5516 return false;
5517
5518 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5519 type,
5520 &ag_mode,
5521 &count,
5522 NULL))
5523 return false;
5524
5525 /* Types larger than 2 registers are returned in memory. */
5526 size = int_size_in_bytes (type);
5527 return (size < 0 || size > 2 * UNITS_PER_WORD);
5528 }
5529
5530 /* Implement TARGET_RETURN_IN_MEMORY.
5531
5532 If the type T of the result of a function is such that
5533 void func (T arg)
5534 would require that arg be passed as a value in a register (or set of
5535 registers) according to the parameter passing rules, then the result
5536 is returned in the same registers as would be used for such an
5537 argument. */
5538
5539 static bool
5540 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5541 {
5542 pure_scalable_type_info pst_info;
5543 switch (pst_info.analyze (type))
5544 {
5545 case pure_scalable_type_info::IS_PST:
5546 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5547 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5548
5549 case pure_scalable_type_info::DOESNT_MATTER:
5550 gcc_assert (aarch64_return_in_memory_1 (type));
5551 return true;
5552
5553 case pure_scalable_type_info::NO_ABI_IDENTITY:
5554 case pure_scalable_type_info::ISNT_PST:
5555 return aarch64_return_in_memory_1 (type);
5556 }
5557 gcc_unreachable ();
5558 }
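/* For example, a 32-byte structure of four 64-bit integers is neither a
   pure scalable type nor an fp/simd return candidate, and it exceeds
   2 * UNITS_PER_WORD, so it is returned in memory; under AAPCS64 the
   caller then supplies the result address in the indirect result
   register X8. */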
5559
5560 static bool
5561 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5562 const_tree type, int *nregs)
5563 {
5564 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5565 return aarch64_vfp_is_call_or_return_candidate (mode,
5566 type,
5567 &pcum->aapcs_vfp_rmode,
5568 nregs,
5569 NULL);
5570 }
5571
5572 /* Given MODE and TYPE of a function argument, return the alignment in
5573 bits. The idea is to suppress any stronger alignment requested by
5574 the user and opt for the natural alignment (specified in AAPCS64 \S
5575 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5576 calculated in versions of GCC prior to GCC-9. This is a helper
5577 function for local use only. */
5578
5579 static unsigned int
5580 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5581 bool *abi_break)
5582 {
5583 *abi_break = false;
5584 if (!type)
5585 return GET_MODE_ALIGNMENT (mode);
5586
5587 if (integer_zerop (TYPE_SIZE (type)))
5588 return 0;
5589
5590 gcc_assert (TYPE_MODE (type) == mode);
5591
5592 if (!AGGREGATE_TYPE_P (type))
5593 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5594
5595 if (TREE_CODE (type) == ARRAY_TYPE)
5596 return TYPE_ALIGN (TREE_TYPE (type));
5597
5598 unsigned int alignment = 0;
5599 unsigned int bitfield_alignment = 0;
5600 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5601 if (TREE_CODE (field) == FIELD_DECL)
5602 {
5603 alignment = std::max (alignment, DECL_ALIGN (field));
5604 if (DECL_BIT_FIELD_TYPE (field))
5605 bitfield_alignment
5606 = std::max (bitfield_alignment,
5607 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5608 }
5609
5610 if (bitfield_alignment > alignment)
5611 {
5612 *abi_break = true;
5613 return bitfield_alignment;
5614 }
5615
5616 return alignment;
5617 }
5618
5619 /* Layout a function argument according to the AAPCS64 rules. The rule
5620 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5621 mode that was originally given to us by the target hook, whereas the
5622 mode in ARG might be the result of replacing partial SVE modes with
5623 the equivalent integer mode. */
5624
5625 static void
5626 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5627 {
5628 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5629 tree type = arg.type;
5630 machine_mode mode = arg.mode;
5631 int ncrn, nvrn, nregs;
5632 bool allocate_ncrn, allocate_nvrn;
5633 HOST_WIDE_INT size;
5634 bool abi_break;
5635
5636 /* We need to do this once per argument. */
5637 if (pcum->aapcs_arg_processed)
5638 return;
5639
5640 pcum->aapcs_arg_processed = true;
5641
5642 pure_scalable_type_info pst_info;
5643 if (type && pst_info.analyze_registers (type))
5644 {
5645 /* The PCS says that it is invalid to pass an SVE value to an
5646 unprototyped function. There is no ABI-defined location we
5647 can return in this case, so we have no real choice but to raise
5648 an error immediately, even though this is only a query function. */
5649 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5650 {
5651 gcc_assert (!pcum->silent_p);
5652 error ("SVE type %qT cannot be passed to an unprototyped function",
5653 arg.type);
5654 /* Avoid repeating the message, and avoid tripping the assert
5655 below. */
5656 pcum->pcs_variant = ARM_PCS_SVE;
5657 }
5658
5659 /* We would have converted the argument into pass-by-reference
5660 form if it didn't fit in registers. */
5661 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5662 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5663 gcc_assert (arg.named
5664 && pcum->pcs_variant == ARM_PCS_SVE
5665 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5666 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5667 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5668 P0_REGNUM + pcum->aapcs_nprn);
5669 return;
5670 }
5671
5672 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5673 are passed by reference, not by value. */
5674 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5675 bool sve_p = (vec_flags & VEC_ANY_SVE);
5676 if (sve_p)
5677 /* Vector types can acquire a partial SVE mode using things like
5678 __attribute__((vector_size(N))), and this is potentially useful.
5679 However, the choice of mode doesn't affect the type's ABI
5680 identity, so we should treat the types as though they had
5681 the associated integer mode, just like they did before SVE
5682 was introduced.
5683
5684 We know that the vector must be 128 bits or smaller,
5685 otherwise we'd have passed it in memory instead. */
5686 gcc_assert (type
5687 && (aarch64_some_values_include_pst_objects_p (type)
5688 || (vec_flags & VEC_PARTIAL)));
5689
5690 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
5691 if (type)
5692 size = int_size_in_bytes (type);
5693 else
5694 /* No frontends can create types with variable-sized modes, so we
5695 shouldn't be asked to pass or return them. */
5696 size = GET_MODE_SIZE (mode).to_constant ();
5697 size = ROUND_UP (size, UNITS_PER_WORD);
5698
5699 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5700 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5701 mode,
5702 type,
5703 &nregs);
5704 gcc_assert (!sve_p || !allocate_nvrn);
5705
5706 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5707 The following code thus handles passing by SIMD/FP registers first. */
5708
5709 nvrn = pcum->aapcs_nvrn;
5710
5711 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5712 and homogeneous short-vector aggregates (HVA). */
5713 if (allocate_nvrn)
5714 {
5715 if (!pcum->silent_p && !TARGET_FLOAT)
5716 aarch64_err_no_fpadvsimd (mode);
5717
5718 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5719 {
5720 pcum->aapcs_nextnvrn = nvrn + nregs;
5721 if (!aarch64_composite_type_p (type, mode))
5722 {
5723 gcc_assert (nregs == 1);
5724 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5725 }
5726 else
5727 {
5728 rtx par;
5729 int i;
5730 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5731 for (i = 0; i < nregs; i++)
5732 {
5733 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5734 V0_REGNUM + nvrn + i);
5735 rtx offset = gen_int_mode
5736 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5737 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5738 XVECEXP (par, 0, i) = tmp;
5739 }
5740 pcum->aapcs_reg = par;
5741 }
5742 return;
5743 }
5744 else
5745 {
5746 /* C.3 NSRN is set to 8. */
5747 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5748 goto on_stack;
5749 }
5750 }
5751
5752 ncrn = pcum->aapcs_ncrn;
5753 nregs = size / UNITS_PER_WORD;
5754
5755 /* C6 - C9, though the sign and zero extension semantics are
5756 handled elsewhere. This is the case where the argument fits
5757 entirely in general registers. */
5758 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5759 {
5760 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5761
5762 /* C.8 if the argument has an alignment of 16 then the NGRN is
5763 rounded up to the next even number. */
5764 if (nregs == 2
5765 && ncrn % 2
5766 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5767 comparison is there because for > 16 * BITS_PER_UNIT
5768 alignment nregs should be > 2 and therefore it should be
5769 passed by reference rather than value. */
5770 && (aarch64_function_arg_alignment (mode, type, &abi_break)
5771 == 16 * BITS_PER_UNIT))
5772 {
5773 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5774 inform (input_location, "parameter passing for argument of type "
5775 "%qT changed in GCC 9.1", type);
5776 ++ncrn;
5777 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5778 }
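/* For example, for the argument list (int, __int128) the int leaves
   NGRN at 1, and the 16-byte-aligned __int128 then skips X1 and is
   passed in the even-numbered pair X2/X3. */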
5779
5780 /* If an argument with an SVE mode needs to be shifted up to the
5781 high part of the register, treat it as though it had an integer mode.
5782 Using the normal (parallel [...]) would suppress the shifting. */
5783 if (sve_p
5784 && BYTES_BIG_ENDIAN
5785 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
5786 && aarch64_pad_reg_upward (mode, type, false))
5787 {
5788 mode = int_mode_for_mode (mode).require ();
5789 sve_p = false;
5790 }
5791
5792 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5793 A reg is still generated for it, but the caller should be smart
5794 enough not to use it. */
5795 if (nregs == 0
5796 || (nregs == 1 && !sve_p)
5797 || GET_MODE_CLASS (mode) == MODE_INT)
5798 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5799 else
5800 {
5801 rtx par;
5802 int i;
5803
5804 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5805 for (i = 0; i < nregs; i++)
5806 {
5807 scalar_int_mode reg_mode = word_mode;
5808 if (nregs == 1)
5809 reg_mode = int_mode_for_mode (mode).require ();
5810 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
5811 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5812 GEN_INT (i * UNITS_PER_WORD));
5813 XVECEXP (par, 0, i) = tmp;
5814 }
5815 pcum->aapcs_reg = par;
5816 }
5817
5818 pcum->aapcs_nextncrn = ncrn + nregs;
5819 return;
5820 }
5821
5822 /* C.11 */
5823 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5824
5825 /* The argument is passed on the stack; record the needed number of words for
5826 this argument and align the total size if necessary. */
5827 on_stack:
5828 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5829
5830 if (aarch64_function_arg_alignment (mode, type, &abi_break)
5831 == 16 * BITS_PER_UNIT)
5832 {
5833 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5834 if (pcum->aapcs_stack_size != new_size)
5835 {
5836 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5837 inform (input_location, "parameter passing for argument of type "
5838 "%qT changed in GCC 9.1", type);
5839 pcum->aapcs_stack_size = new_size;
5840 }
5841 }
5842 return;
5843 }
5844
5845 /* Implement TARGET_FUNCTION_ARG. */
5846
5847 static rtx
5848 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5849 {
5850 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5851 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5852 || pcum->pcs_variant == ARM_PCS_SIMD
5853 || pcum->pcs_variant == ARM_PCS_SVE);
5854
5855 if (arg.end_marker_p ())
5856 return gen_int_mode (pcum->pcs_variant, DImode);
5857
5858 aarch64_layout_arg (pcum_v, arg);
5859 return pcum->aapcs_reg;
5860 }
5861
5862 void
5863 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5864 const_tree fntype,
5865 rtx libname ATTRIBUTE_UNUSED,
5866 const_tree fndecl ATTRIBUTE_UNUSED,
5867 unsigned n_named ATTRIBUTE_UNUSED,
5868 bool silent_p)
5869 {
5870 pcum->aapcs_ncrn = 0;
5871 pcum->aapcs_nvrn = 0;
5872 pcum->aapcs_nprn = 0;
5873 pcum->aapcs_nextncrn = 0;
5874 pcum->aapcs_nextnvrn = 0;
5875 pcum->aapcs_nextnprn = 0;
5876 if (fntype)
5877 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5878 else
5879 pcum->pcs_variant = ARM_PCS_AAPCS64;
5880 pcum->aapcs_reg = NULL_RTX;
5881 pcum->aapcs_arg_processed = false;
5882 pcum->aapcs_stack_words = 0;
5883 pcum->aapcs_stack_size = 0;
5884 pcum->silent_p = silent_p;
5885
5886 if (!silent_p
5887 && !TARGET_FLOAT
5888 && fndecl && TREE_PUBLIC (fndecl)
5889 && fntype && fntype != error_mark_node)
5890 {
5891 const_tree type = TREE_TYPE (fntype);
5892 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5893 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5894 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5895 &mode, &nregs, NULL))
5896 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5897 }
5898
5899 if (!silent_p
5900 && !TARGET_SVE
5901 && pcum->pcs_variant == ARM_PCS_SVE)
5902 {
5903 /* We can't gracefully recover at this point, so make this a
5904 fatal error. */
5905 if (fndecl)
5906 fatal_error (input_location, "%qE requires the SVE ISA extension",
5907 fndecl);
5908 else
5909 fatal_error (input_location, "calls to functions of type %qT require"
5910 " the SVE ISA extension", fntype);
5911 }
5912 }
5913
5914 static void
5915 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5916 const function_arg_info &arg)
5917 {
5918 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5919 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5920 || pcum->pcs_variant == ARM_PCS_SIMD
5921 || pcum->pcs_variant == ARM_PCS_SVE)
5922 {
5923 aarch64_layout_arg (pcum_v, arg);
5924 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5925 != (pcum->aapcs_stack_words != 0));
5926 pcum->aapcs_arg_processed = false;
5927 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5928 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5929 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5930 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5931 pcum->aapcs_stack_words = 0;
5932 pcum->aapcs_reg = NULL_RTX;
5933 }
5934 }
5935
5936 bool
5937 aarch64_function_arg_regno_p (unsigned regno)
5938 {
5939 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5940 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5941 }
5942
5943 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5944 PARM_BOUNDARY bits of alignment, but will be given anything up
5945 to STACK_BOUNDARY bits if the type requires it. This makes sure
5946 that both before and after the layout of each argument, the Next
5947 Stacked Argument Address (NSAA) will have a minimum alignment of
5948 8 bytes. */
5949
5950 static unsigned int
5951 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5952 {
5953 bool abi_break;
5954 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5955 &abi_break);
5956 if (abi_break && warn_psabi)
5957 inform (input_location, "parameter passing for argument of type "
5958 "%qT changed in GCC 9.1", type);
5959
5960 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5961 }
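/* On AArch64, PARM_BOUNDARY is 64 bits and STACK_BOUNDARY is 128 bits,
   so, for example, a char argument is still given 64-bit alignment on
   the stack, while a type requesting 256-bit alignment is clamped to
   128 bits here. */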
5962
5963 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5964
5965 static fixed_size_mode
5966 aarch64_get_reg_raw_mode (int regno)
5967 {
5968 if (TARGET_SVE && FP_REGNUM_P (regno))
5969 /* Don't use the SVE part of the register for __builtin_apply and
5970 __builtin_return. The SVE registers aren't used by the normal PCS,
5971 so using them there would be a waste of time. The PCS extensions
5972 for SVE types are fundamentally incompatible with the
5973 __builtin_return/__builtin_apply interface. */
5974 return as_a <fixed_size_mode> (V16QImode);
5975 return default_get_reg_raw_mode (regno);
5976 }
5977
5978 /* Implement TARGET_FUNCTION_ARG_PADDING.
5979
5980 Small aggregate types are placed in the lowest memory address.
5981
5982 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5983
5984 static pad_direction
5985 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5986 {
5987 /* On little-endian targets, the least significant byte of every stack
5988 argument is passed at the lowest byte address of the stack slot. */
5989 if (!BYTES_BIG_ENDIAN)
5990 return PAD_UPWARD;
5991
5992 /* Otherwise, integral, floating-point and pointer types are padded downward:
5993 the least significant byte of a stack argument is passed at the highest
5994 byte address of the stack slot. */
5995 if (type
5996 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5997 || POINTER_TYPE_P (type))
5998 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5999 return PAD_DOWNWARD;
6000
6001 /* Everything else padded upward, i.e. data in first byte of stack slot. */
6002 return PAD_UPWARD;
6003 }
6004
6005 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6006
6007 It specifies padding for the last (and possibly the only)
6008 element of a block move between registers and memory. Assuming
6009 the block is in memory, padding upward means that the last
6010 element is padded after its most significant byte, while with
6011 downward padding the last element is padded on its least
6012 significant byte side.
6013
6014 Small aggregates and small complex types are always padded
6015 upwards.
6016
6017 We don't need to worry about homogeneous floating-point or
6018 short-vector aggregates; their move is not affected by the
6019 padding direction determined here. Regardless of endianness,
6020 each element of such an aggregate is put in the least
6021 significant bits of a fp/simd register.
6022
6023 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6024 register has useful data, and return the opposite if the most
6025 significant byte does. */
6026
6027 bool
6028 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6029 bool first ATTRIBUTE_UNUSED)
6030 {
6031
6032 /* Aside from pure scalable types, small composite types are always
6033 padded upward. */
6034 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6035 {
6036 HOST_WIDE_INT size;
6037 if (type)
6038 size = int_size_in_bytes (type);
6039 else
6040 /* No frontends can create types with variable-sized modes, so we
6041 shouldn't be asked to pass or return them. */
6042 size = GET_MODE_SIZE (mode).to_constant ();
6043 if (size < 2 * UNITS_PER_WORD)
6044 {
6045 pure_scalable_type_info pst_info;
6046 if (pst_info.analyze_registers (type))
6047 return false;
6048 return true;
6049 }
6050 }
6051
6052 /* Otherwise, use the default padding. */
6053 return !BYTES_BIG_ENDIAN;
6054 }
6055
6056 static scalar_int_mode
6057 aarch64_libgcc_cmp_return_mode (void)
6058 {
6059 return SImode;
6060 }
6061
6062 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6063
6064 /* We use the 12-bit shifted immediate arithmetic instructions so values
6065 must be a multiple of (1 << 12), i.e. 4096. */
6066 #define ARITH_FACTOR 4096
6067
6068 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6069 #error Cannot use simple address calculation for stack probing
6070 #endif
6071
6072 /* The pair of scratch registers used for stack probing. */
6073 #define PROBE_STACK_FIRST_REG R9_REGNUM
6074 #define PROBE_STACK_SECOND_REG R10_REGNUM
6075
6076 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6077 inclusive. These are offsets from the current stack pointer. */
6078
6079 static void
6080 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6081 {
6082 HOST_WIDE_INT size;
6083 if (!poly_size.is_constant (&size))
6084 {
6085 sorry ("stack probes for SVE frames");
6086 return;
6087 }
6088
6089 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
6090
6091 /* See the same assertion on PROBE_INTERVAL above. */
6092 gcc_assert ((first % ARITH_FACTOR) == 0);
6093
6094 /* See if we have a constant small number of probes to generate. If so,
6095 that's the easy case. */
6096 if (size <= PROBE_INTERVAL)
6097 {
6098 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6099
6100 emit_set_insn (reg1,
6101 plus_constant (Pmode,
6102 stack_pointer_rtx, -(first + base)));
6103 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6104 }
6105
6106 /* The run-time loop is made up of 8 insns in the generic case while the
6107 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
6108 else if (size <= 4 * PROBE_INTERVAL)
6109 {
6110 HOST_WIDE_INT i, rem;
6111
6112 emit_set_insn (reg1,
6113 plus_constant (Pmode,
6114 stack_pointer_rtx,
6115 -(first + PROBE_INTERVAL)));
6116 emit_stack_probe (reg1);
6117
6118 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6119 it exceeds SIZE. If only two probes are needed, this will not
6120 generate any code. Then probe at FIRST + SIZE. */
6121 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6122 {
6123 emit_set_insn (reg1,
6124 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6125 emit_stack_probe (reg1);
6126 }
6127
6128 rem = size - (i - PROBE_INTERVAL);
6129 if (rem > 256)
6130 {
6131 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6132
6133 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6134 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6135 }
6136 else
6137 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6138 }
6139
6140 /* Otherwise, do the same as above, but in a loop. Note that we must be
6141 extra careful with variables wrapping around because we might be at
6142 the very top (or the very bottom) of the address space and we have
6143 to be able to handle this case properly; in particular, we use an
6144 equality test for the loop condition. */
6145 else
6146 {
6147 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
6148
6149 /* Step 1: round SIZE to the previous multiple of the interval. */
6150
6151 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6152
6153
6154 /* Step 2: compute initial and final value of the loop counter. */
6155
6156 /* TEST_ADDR = SP + FIRST. */
6157 emit_set_insn (reg1,
6158 plus_constant (Pmode, stack_pointer_rtx, -first));
6159
6160 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6161 HOST_WIDE_INT adjustment = - (first + rounded_size);
6162 if (! aarch64_uimm12_shift (adjustment))
6163 {
6164 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6165 true, Pmode);
6166 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6167 }
6168 else
6169 emit_set_insn (reg2,
6170 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6171
6172 /* Step 3: the loop
6173
6174 do
6175 {
6176 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6177 probe at TEST_ADDR
6178 }
6179 while (TEST_ADDR != LAST_ADDR)
6180
6181 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6182 until it is equal to ROUNDED_SIZE. */
6183
6184 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6185
6186
6187 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6188 that SIZE is equal to ROUNDED_SIZE. */
6189
6190 if (size != rounded_size)
6191 {
6192 HOST_WIDE_INT rem = size - rounded_size;
6193
6194 if (rem > 256)
6195 {
6196 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6197
6198 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6199 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6200 }
6201 else
6202 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6203 }
6204 }
6205
6206 /* Make sure nothing is scheduled before we are done. */
6207 emit_insn (gen_blockage ());
6208 }
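/* As a worked example (assuming a 4096-byte PROBE_INTERVAL), FIRST = 0
   and SIZE = 10000 takes the second branch above: probes are emitted at
   SP - 4096 and SP - 8192, the residual is 10000 - 8192 = 1808 bytes,
   and since that exceeds 256 the scratch register is stepped down by
   ROUND_UP (1808, 4096) = 4096 and the final probe lands at offset
   4096 - 1808 from it, i.e. exactly at SP - 10000. */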
6209
6210 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6211 absolute addresses. */
6212
6213 const char *
6214 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6215 {
6216 static int labelno = 0;
6217 char loop_lab[32];
6218 rtx xops[2];
6219
6220 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6221
6222 /* Loop. */
6223 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6224
6225 HOST_WIDE_INT stack_clash_probe_interval
6226 = 1 << param_stack_clash_protection_guard_size;
6227
6228 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6229 xops[0] = reg1;
6230 HOST_WIDE_INT interval;
6231 if (flag_stack_clash_protection)
6232 interval = stack_clash_probe_interval;
6233 else
6234 interval = PROBE_INTERVAL;
6235
6236 gcc_assert (aarch64_uimm12_shift (interval));
6237 xops[1] = GEN_INT (interval);
6238
6239 output_asm_insn ("sub\t%0, %0, %1", xops);
6240
6241 /* If doing stack clash protection then we probe up by the ABI specified
6242 amount. We do this because we're dropping full pages at a time in the
6243 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6244 if (flag_stack_clash_protection)
6245 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6246 else
6247 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6248
6249 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6250 by this amount for each iteration. */
6251 output_asm_insn ("str\txzr, [%0, %1]", xops);
6252
6253 /* Test if TEST_ADDR == LAST_ADDR. */
6254 xops[1] = reg2;
6255 output_asm_insn ("cmp\t%0, %1", xops);
6256
6257 /* Branch. */
6258 fputs ("\tb.ne\t", asm_out_file);
6259 assemble_name_raw (asm_out_file, loop_lab);
6260 fputc ('\n', asm_out_file);
6261
6262 return "";
6263 }
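/* With stack clash protection disabled and the scratch registers chosen
   by aarch64_emit_probe_stack_range, the loop printed above comes out
   roughly as:

       .LPSRL0:
               sub     x9, x9, 4096
               str     xzr, [x9, 0]
               cmp     x9, x10
               b.ne    .LPSRL0  */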
6264
6265 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6266 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6267 of GUARD_SIZE. When a probe is emitted it is done at most
6268 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6269 at most MIN_PROBE_THRESHOLD. By the end of this function
6270 BASE = BASE - ADJUSTMENT. */
6271
6272 const char *
6273 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6274 rtx min_probe_threshold, rtx guard_size)
6275 {
6276 /* This function is not allowed to use any instruction generation function
6277 like gen_ and friends. If you do, you'll likely ICE during CFG validation,
6278 so instead emit the code you want using output_asm_insn. */
6279 gcc_assert (flag_stack_clash_protection);
6280 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6281 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6282
6283 /* The minimum required allocation before the residual requires probing. */
6284 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6285
6286 /* Clamp the value down to the nearest value that can be used with a cmp. */
6287 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6288 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6289
6290 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6291 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6292
6293 static int labelno = 0;
6294 char loop_start_lab[32];
6295 char loop_end_lab[32];
6296 rtx xops[2];
6297
6298 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6299 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6300
6301 /* Emit loop start label. */
6302 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6303
6304 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6305 xops[0] = adjustment;
6306 xops[1] = probe_offset_value_rtx;
6307 output_asm_insn ("cmp\t%0, %1", xops);
6308
6309 /* Branch to end if not enough adjustment to probe. */
6310 fputs ("\tb.lt\t", asm_out_file);
6311 assemble_name_raw (asm_out_file, loop_end_lab);
6312 fputc ('\n', asm_out_file);
6313
6314 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6315 xops[0] = base;
6316 xops[1] = probe_offset_value_rtx;
6317 output_asm_insn ("sub\t%0, %0, %1", xops);
6318
6319 /* Probe at BASE. */
6320 xops[1] = const0_rtx;
6321 output_asm_insn ("str\txzr, [%0, %1]", xops);
6322
6323 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6324 xops[0] = adjustment;
6325 xops[1] = probe_offset_value_rtx;
6326 output_asm_insn ("sub\t%0, %0, %1", xops);
6327
6328 /* Branch to start if still more bytes to allocate. */
6329 fputs ("\tb\t", asm_out_file);
6330 assemble_name_raw (asm_out_file, loop_start_lab);
6331 fputc ('\n', asm_out_file);
6332
6333 /* No probe needed; leave the loop. */
6334 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6335
6336 /* BASE = BASE - ADJUSTMENT. */
6337 xops[0] = base;
6338 xops[1] = adjustment;
6339 output_asm_insn ("sub\t%0, %0, %1", xops);
6340 return "";
6341 }
6342
6343 /* Determine whether a frame chain needs to be generated. */
6344 static bool
6345 aarch64_needs_frame_chain (void)
6346 {
6347 /* Force a frame chain for EH returns so the return address is at FP+8. */
6348 if (frame_pointer_needed || crtl->calls_eh_return)
6349 return true;
6350
6351 /* A leaf function cannot have calls or write LR. */
6352 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6353
6354 /* Don't use a frame chain in leaf functions if leaf frame pointers
6355 are disabled. */
6356 if (flag_omit_leaf_frame_pointer && is_leaf)
6357 return false;
6358
6359 return aarch64_use_frame_pointer;
6360 }
6361
6362 /* Mark the registers that need to be saved by the callee and calculate
6363 the size of the callee-saved registers area and frame record (both FP
6364 and LR may be omitted). */
6365 static void
6366 aarch64_layout_frame (void)
6367 {
6368 poly_int64 offset = 0;
6369 int regno, last_fp_reg = INVALID_REGNUM;
6370 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6371 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6372 bool frame_related_fp_reg_p = false;
6373 aarch64_frame &frame = cfun->machine->frame;
6374
6375 frame.emit_frame_chain = aarch64_needs_frame_chain ();
6376
6377 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6378 the mid-end is doing. */
6379 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6380
6381 #define SLOT_NOT_REQUIRED (-2)
6382 #define SLOT_REQUIRED (-1)
6383
6384 frame.wb_candidate1 = INVALID_REGNUM;
6385 frame.wb_candidate2 = INVALID_REGNUM;
6386 frame.spare_pred_reg = INVALID_REGNUM;
6387
6388 /* First mark all the registers that really need to be saved... */
6389 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6390 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6391
6392 /* ... that includes the eh data registers (if needed)... */
6393 if (crtl->calls_eh_return)
6394 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6395 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6396
6397 /* ... and any callee saved register that dataflow says is live. */
6398 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6399 if (df_regs_ever_live_p (regno)
6400 && !fixed_regs[regno]
6401 && (regno == R30_REGNUM
6402 || !crtl->abi->clobbers_full_reg_p (regno)))
6403 frame.reg_offset[regno] = SLOT_REQUIRED;
6404
6405 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6406 if (df_regs_ever_live_p (regno)
6407 && !fixed_regs[regno]
6408 && !crtl->abi->clobbers_full_reg_p (regno))
6409 {
6410 frame.reg_offset[regno] = SLOT_REQUIRED;
6411 last_fp_reg = regno;
6412 if (aarch64_emit_cfi_for_reg_p (regno))
6413 frame_related_fp_reg_p = true;
6414 }
6415
6416 /* Big-endian SVE frames need a spare predicate register in order
6417 to save Z8-Z15. Decide which register they should use. Prefer
6418 an unused argument register if possible, so that we don't force P4
6419 to be saved unnecessarily. */
6420 if (frame_related_fp_reg_p
6421 && crtl->abi->id () == ARM_PCS_SVE
6422 && BYTES_BIG_ENDIAN)
6423 {
6424 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6425 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6426 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6427 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6428 break;
6429 gcc_assert (regno <= P7_REGNUM);
6430 frame.spare_pred_reg = regno;
6431 df_set_regs_ever_live (regno, true);
6432 }
6433
6434 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6435 if (df_regs_ever_live_p (regno)
6436 && !fixed_regs[regno]
6437 && !crtl->abi->clobbers_full_reg_p (regno))
6438 frame.reg_offset[regno] = SLOT_REQUIRED;
6439
6440 /* With stack-clash, LR must be saved in non-leaf functions. */
6441 gcc_assert (crtl->is_leaf
6442 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6443
6444 /* Now assign stack slots for the registers. Start with the predicate
6445 registers, since predicate LDR and STR have a relatively small
6446 offset range. These saves happen below the hard frame pointer. */
6447 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6448 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6449 {
6450 frame.reg_offset[regno] = offset;
6451 offset += BYTES_PER_SVE_PRED;
6452 }
6453
6454 if (maybe_ne (offset, 0))
6455 {
6456 /* If we have any vector registers to save above the predicate registers,
6457 the offset of the vector register save slots need to be a multiple
6458 of the vector size. This lets us use the immediate forms of LDR/STR
6459 (or LD1/ST1 for big-endian).
6460
6461 A vector register is 8 times the size of a predicate register,
6462 and we need to save a maximum of 12 predicate registers, so the
6463 first vector register will be at either #1, MUL VL or #2, MUL VL.
6464
6465 If we don't have any vector registers to save, and we know how
6466 big the predicate save area is, we can just round it up to the
6467 next 16-byte boundary. */
6468 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6469 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6470 else
6471 {
6472 if (known_le (offset, vector_save_size))
6473 offset = vector_save_size;
6474 else if (known_le (offset, vector_save_size * 2))
6475 offset = vector_save_size * 2;
6476 else
6477 gcc_unreachable ();
6478 }
6479 }
6480
6481 /* If we need to save any SVE vector registers, add them next. */
6482 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6483 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6484 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6485 {
6486 frame.reg_offset[regno] = offset;
6487 offset += vector_save_size;
6488 }
6489
6490 /* OFFSET is now the offset of the hard frame pointer from the bottom
6491 of the callee save area. */
6492 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6493 frame.below_hard_fp_saved_regs_size = offset;
6494 if (frame.emit_frame_chain)
6495 {
6496 /* FP and LR are placed in the linkage record. */
6497 frame.reg_offset[R29_REGNUM] = offset;
6498 frame.wb_candidate1 = R29_REGNUM;
6499 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6500 frame.wb_candidate2 = R30_REGNUM;
6501 offset += 2 * UNITS_PER_WORD;
6502 }
6503
6504 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6505 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6506 {
6507 frame.reg_offset[regno] = offset;
6508 if (frame.wb_candidate1 == INVALID_REGNUM)
6509 frame.wb_candidate1 = regno;
6510 else if (frame.wb_candidate2 == INVALID_REGNUM)
6511 frame.wb_candidate2 = regno;
6512 offset += UNITS_PER_WORD;
6513 }
6514
6515 poly_int64 max_int_offset = offset;
6516 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6517 bool has_align_gap = maybe_ne (offset, max_int_offset);
6518
6519 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6520 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6521 {
6522 /* If there is an alignment gap between integer and fp callee-saves,
6523 allocate the last fp register to it if possible. */
6524 if (regno == last_fp_reg
6525 && has_align_gap
6526 && known_eq (vector_save_size, 8)
6527 && multiple_p (offset, 16))
6528 {
6529 frame.reg_offset[regno] = max_int_offset;
6530 break;
6531 }
6532
6533 frame.reg_offset[regno] = offset;
6534 if (frame.wb_candidate1 == INVALID_REGNUM)
6535 frame.wb_candidate1 = regno;
6536 else if (frame.wb_candidate2 == INVALID_REGNUM
6537 && frame.wb_candidate1 >= V0_REGNUM)
6538 frame.wb_candidate2 = regno;
6539 offset += vector_save_size;
6540 }
6541
6542 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6543
6544 frame.saved_regs_size = offset;
6545
6546 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6547
6548 poly_int64 above_outgoing_args
6549 = aligned_upper_bound (varargs_and_saved_regs_size
6550 + get_frame_size (),
6551 STACK_BOUNDARY / BITS_PER_UNIT);
6552
6553 frame.hard_fp_offset
6554 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6555
6556 /* Both these values are already aligned. */
6557 gcc_assert (multiple_p (crtl->outgoing_args_size,
6558 STACK_BOUNDARY / BITS_PER_UNIT));
6559 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6560
6561 frame.locals_offset = frame.saved_varargs_size;
6562
6563 frame.initial_adjust = 0;
6564 frame.final_adjust = 0;
6565 frame.callee_adjust = 0;
6566 frame.sve_callee_adjust = 0;
6567 frame.callee_offset = 0;
6568
6569 HOST_WIDE_INT max_push_offset = 0;
6570 if (frame.wb_candidate2 != INVALID_REGNUM)
6571 max_push_offset = 512;
6572 else if (frame.wb_candidate1 != INVALID_REGNUM)
6573 max_push_offset = 256;
6574
6575 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6576 HOST_WIDE_INT const_saved_regs_size;
6577 if (frame.frame_size.is_constant (&const_size)
6578 && const_size < max_push_offset
6579 && known_eq (frame.hard_fp_offset, const_size))
6580 {
6581 /* Simple, small frame with no outgoing arguments:
6582
6583 stp reg1, reg2, [sp, -frame_size]!
6584 stp reg3, reg4, [sp, 16] */
6585 frame.callee_adjust = const_size;
6586 }
6587 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6588 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6589 && const_outgoing_args_size + const_saved_regs_size < 512
6590 /* We could handle this case even with outgoing args, provided
6591 that the number of args left us with valid offsets for all
6592 predicate and vector save slots. It's such a rare case that
6593 it hardly seems worth the effort though. */
6594 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6595 && !(cfun->calls_alloca
6596 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6597 && const_fp_offset < max_push_offset))
6598 {
6599 /* Frame with small outgoing arguments:
6600
6601 sub sp, sp, frame_size
6602 stp reg1, reg2, [sp, outgoing_args_size]
6603 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6604 frame.initial_adjust = frame.frame_size;
6605 frame.callee_offset = const_outgoing_args_size;
6606 }
6607 else if (saves_below_hard_fp_p
6608 && known_eq (frame.saved_regs_size,
6609 frame.below_hard_fp_saved_regs_size))
6610 {
6611 /* Frame in which all saves are SVE saves:
6612
6613 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6614 save SVE registers relative to SP
6615 sub sp, sp, outgoing_args_size */
6616 frame.initial_adjust = (frame.hard_fp_offset
6617 + frame.below_hard_fp_saved_regs_size);
6618 frame.final_adjust = crtl->outgoing_args_size;
6619 }
6620 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6621 && const_fp_offset < max_push_offset)
6622 {
6623 /* Frame with large outgoing arguments or SVE saves, but with
6624 a small local area:
6625
6626 stp reg1, reg2, [sp, -hard_fp_offset]!
6627 stp reg3, reg4, [sp, 16]
6628 [sub sp, sp, below_hard_fp_saved_regs_size]
6629 [save SVE registers relative to SP]
6630 sub sp, sp, outgoing_args_size */
6631 frame.callee_adjust = const_fp_offset;
6632 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6633 frame.final_adjust = crtl->outgoing_args_size;
6634 }
6635 else
6636 {
6637 /* Frame with large local area and outgoing arguments or SVE saves,
6638 using frame pointer:
6639
6640 sub sp, sp, hard_fp_offset
6641 stp x29, x30, [sp, 0]
6642 add x29, sp, 0
6643 stp reg3, reg4, [sp, 16]
6644 [sub sp, sp, below_hard_fp_saved_regs_size]
6645 [save SVE registers relative to SP]
6646 sub sp, sp, outgoing_args_size */
6647 frame.initial_adjust = frame.hard_fp_offset;
6648 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6649 frame.final_adjust = crtl->outgoing_args_size;
6650 }
6651
6652 /* Make sure the individual adjustments add up to the full frame size. */
6653 gcc_assert (known_eq (frame.initial_adjust
6654 + frame.callee_adjust
6655 + frame.sve_callee_adjust
6656 + frame.final_adjust, frame.frame_size));
6657
6658 frame.laid_out = true;
6659 }
6660
6661 /* Return true if the register REGNO is saved on entry to
6662 the current function. */
6663
6664 static bool
6665 aarch64_register_saved_on_entry (int regno)
6666 {
6667 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6668 }
6669
6670 /* Return the next register from REGNO up to LIMIT that the callee
6671 needs to save. */
6672
6673 static unsigned
6674 aarch64_next_callee_save (unsigned regno, unsigned limit)
6675 {
6676 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6677 regno++;
6678 return regno;
6679 }
6680
6681 /* Push the register number REGNO of mode MODE to the stack with write-back
6682 adjusting the stack by ADJUSTMENT. */
6683
6684 static void
6685 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6686 HOST_WIDE_INT adjustment)
6687 {
6688 rtx base_rtx = stack_pointer_rtx;
6689 rtx insn, reg, mem;
6690
6691 reg = gen_rtx_REG (mode, regno);
6692 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6693 plus_constant (Pmode, base_rtx, -adjustment));
6694 mem = gen_frame_mem (mode, mem);
6695
6696 insn = emit_move_insn (mem, reg);
6697 RTX_FRAME_RELATED_P (insn) = 1;
6698 }
6699
6700 /* Generate and return an instruction to store the pair of registers
6701 REG and REG2 of mode MODE to location BASE with write-back adjusting
6702 the stack location BASE by ADJUSTMENT. */
6703
6704 static rtx
6705 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6706 HOST_WIDE_INT adjustment)
6707 {
6708 switch (mode)
6709 {
6710 case E_DImode:
6711 return gen_storewb_pairdi_di (base, base, reg, reg2,
6712 GEN_INT (-adjustment),
6713 GEN_INT (UNITS_PER_WORD - adjustment));
6714 case E_DFmode:
6715 return gen_storewb_pairdf_di (base, base, reg, reg2,
6716 GEN_INT (-adjustment),
6717 GEN_INT (UNITS_PER_WORD - adjustment));
6718 case E_TFmode:
6719 return gen_storewb_pairtf_di (base, base, reg, reg2,
6720 GEN_INT (-adjustment),
6721 GEN_INT (UNITS_PER_VREG - adjustment));
6722 default:
6723 gcc_unreachable ();
6724 }
6725 }
6726
6727 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6728 stack pointer by ADJUSTMENT. */
6729
6730 static void
6731 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6732 {
6733 rtx_insn *insn;
6734 machine_mode mode = aarch64_reg_save_mode (regno1);
6735
6736 if (regno2 == INVALID_REGNUM)
6737 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6738
6739 rtx reg1 = gen_rtx_REG (mode, regno1);
6740 rtx reg2 = gen_rtx_REG (mode, regno2);
6741
6742 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6743 reg2, adjustment));
6744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6745 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6746 RTX_FRAME_RELATED_P (insn) = 1;
6747 }
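/* For example, pushing the frame record with an adjustment of 96 bytes
   emits a store pair with write-back along the lines of:

       stp     x29, x30, [sp, -96]!  */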
6748
6749 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
6750 adjusting it by ADJUSTMENT afterwards. */
6751
6752 static rtx
6753 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6754 HOST_WIDE_INT adjustment)
6755 {
6756 switch (mode)
6757 {
6758 case E_DImode:
6759 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6760 GEN_INT (UNITS_PER_WORD));
6761 case E_DFmode:
6762 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6763 GEN_INT (UNITS_PER_WORD));
6764 case E_TFmode:
6765 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6766 GEN_INT (UNITS_PER_VREG));
6767 default:
6768 gcc_unreachable ();
6769 }
6770 }
6771
6772 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6773 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6774 into CFI_OPS. */
6775
6776 static void
6777 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6778 rtx *cfi_ops)
6779 {
6780 machine_mode mode = aarch64_reg_save_mode (regno1);
6781 rtx reg1 = gen_rtx_REG (mode, regno1);
6782
6783 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6784
6785 if (regno2 == INVALID_REGNUM)
6786 {
6787 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6788 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6789 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6790 }
6791 else
6792 {
6793 rtx reg2 = gen_rtx_REG (mode, regno2);
6794 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6795 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6796 reg2, adjustment));
6797 }
6798 }
6799
6800 /* Generate and return a store pair instruction of mode MODE to store
6801 register REG1 to MEM1 and register REG2 to MEM2. */
6802
6803 static rtx
6804 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6805 rtx reg2)
6806 {
6807 switch (mode)
6808 {
6809 case E_DImode:
6810 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6811
6812 case E_DFmode:
6813 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6814
6815 case E_TFmode:
6816 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6817
6818 default:
6819 gcc_unreachable ();
6820 }
6821 }
6822
6823 /* Generate and return a load pair instruction of mode MODE to load register
6824 REG1 from MEM1 and register REG2 from MEM2. */
6825
6826 static rtx
6827 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6828 rtx mem2)
6829 {
6830 switch (mode)
6831 {
6832 case E_DImode:
6833 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6834
6835 case E_DFmode:
6836 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6837
6838 case E_TFmode:
6839 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6840
6841 default:
6842 gcc_unreachable ();
6843 }
6844 }
6845
6846 /* Return TRUE if return address signing should be enabled for the current
6847 function, otherwise return FALSE. */
6848
6849 bool
6850 aarch64_return_address_signing_enabled (void)
6851 {
6852 /* This function should only be called after the frame has been laid out. */
6853 gcc_assert (cfun->machine->frame.laid_out);
6854
6855 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we sign a leaf function
6856 only if its LR is pushed onto the stack. */
6857 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6858 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6859 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6860 }
6861
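/* A rough mapping, for reference (the authoritative handling is in the
   option-parsing code elsewhere in this file): -mbranch-protection=pac-ret
   selects AARCH64_FUNCTION_NON_LEAF, while -mbranch-protection=pac-ret+leaf
   selects AARCH64_FUNCTION_ALL.  */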
6862 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6863 bool
6864 aarch64_bti_enabled (void)
6865 {
6866 return (aarch64_enable_bti == 1);
6867 }
6868
6869 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6870 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6871 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6872
6873 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6874 or LD1D address
6875
6876 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6877 if the variable isn't already nonnull
6878
6879 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6880 Handle this case using a temporary base register that is suitable for
6881 all offsets in that range. Use ANCHOR_REG as this base register if it
6882 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6883
6884 static inline void
6885 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6886 rtx &anchor_reg, poly_int64 &offset,
6887 rtx &ptrue)
6888 {
6889 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6890 {
6891 /* This is the maximum valid offset of the anchor from the base.
6892 Lower values would be valid too. */
6893 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6894 if (!anchor_reg)
6895 {
6896 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6897 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6898 gen_int_mode (anchor_offset, Pmode)));
6899 }
6900 base_rtx = anchor_reg;
6901 offset -= anchor_offset;
6902 }
6903 if (!ptrue)
6904 {
6905 int pred_reg = cfun->machine->frame.spare_pred_reg;
6906 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6907 CONSTM1_RTX (VNx16BImode));
6908 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6909 }
6910 }
6911
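/* Illustrative shape of what the adjustment above enables (details vary
   with the mode and offset): once the anchor register and the spare
   predicate have been set up, the caller can emit a single predicated
   store or load such as

	ptrue	p4.b, all
	st1d	z8.d, p4, [x11, #-4, mul vl]

   where x11 is the anchor register and the immediate is back within the
   signed 4-bit "mul vl" range that ST1D/LD1D accept.  */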
6912 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6913 is saved at BASE + OFFSET. */
6914
6915 static void
6916 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6917 rtx base, poly_int64 offset)
6918 {
6919 rtx mem = gen_frame_mem (GET_MODE (reg),
6920 plus_constant (Pmode, base, offset));
6921 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6922 }
6923
6924 /* Emit code to save the callee-saved registers from register number START
6925 to LIMIT to the stack at the location starting at offset START_OFFSET,
6926 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6927 is true if the hard frame pointer has been set up. */
6928
6929 static void
6930 aarch64_save_callee_saves (poly_int64 start_offset,
6931 unsigned start, unsigned limit, bool skip_wb,
6932 bool hard_fp_valid_p)
6933 {
6934 rtx_insn *insn;
6935 unsigned regno;
6936 unsigned regno2;
6937 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6938
6939 for (regno = aarch64_next_callee_save (start, limit);
6940 regno <= limit;
6941 regno = aarch64_next_callee_save (regno + 1, limit))
6942 {
6943 rtx reg, mem;
6944 poly_int64 offset;
6945 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6946
6947 if (skip_wb
6948 && (regno == cfun->machine->frame.wb_candidate1
6949 || regno == cfun->machine->frame.wb_candidate2))
6950 continue;
6951
6952 if (cfun->machine->reg_is_wrapped_separately[regno])
6953 continue;
6954
6955 machine_mode mode = aarch64_reg_save_mode (regno);
6956 reg = gen_rtx_REG (mode, regno);
6957 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6958 rtx base_rtx = stack_pointer_rtx;
6959 poly_int64 sp_offset = offset;
6960
6961 HOST_WIDE_INT const_offset;
6962 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6963 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6964 offset, ptrue);
6965 else if (GP_REGNUM_P (regno)
6966 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6967 {
6968 gcc_assert (known_eq (start_offset, 0));
6969 poly_int64 fp_offset
6970 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6971 if (hard_fp_valid_p)
6972 base_rtx = hard_frame_pointer_rtx;
6973 else
6974 {
6975 if (!anchor_reg)
6976 {
6977 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6978 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6979 gen_int_mode (fp_offset, Pmode)));
6980 }
6981 base_rtx = anchor_reg;
6982 }
6983 offset -= fp_offset;
6984 }
6985 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6986 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6987
6988 if (!aarch64_sve_mode_p (mode)
6989 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6990 && !cfun->machine->reg_is_wrapped_separately[regno2]
6991 && known_eq (GET_MODE_SIZE (mode),
6992 cfun->machine->frame.reg_offset[regno2]
6993 - cfun->machine->frame.reg_offset[regno]))
6994 {
6995 rtx reg2 = gen_rtx_REG (mode, regno2);
6996 rtx mem2;
6997
6998 offset += GET_MODE_SIZE (mode);
6999 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7000 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7001 reg2));
7002
7003 /* The first part of a frame-related parallel insn is
7004 always assumed to be relevant to the frame
7005 calculations; subsequent parts are only
7006 frame-related if explicitly marked. */
7007 if (aarch64_emit_cfi_for_reg_p (regno2))
7008 {
7009 if (need_cfa_note_p)
7010 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7011 sp_offset + GET_MODE_SIZE (mode));
7012 else
7013 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7014 }
7015
7016 regno = regno2;
7017 }
7018 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7019 {
7020 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7021 need_cfa_note_p = true;
7022 }
7023 else if (aarch64_sve_mode_p (mode))
7024 insn = emit_insn (gen_rtx_SET (mem, reg));
7025 else
7026 insn = emit_move_insn (mem, reg);
7027
7028 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7029 if (frame_related_p && need_cfa_note_p)
7030 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7031 }
7032 }
7033
7034 /* Emit code to restore the callee registers from register number START
7035 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7036 skipping any write-back candidates if SKIP_WB is true. Write the
7037 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7038
7039 static void
7040 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7041 unsigned limit, bool skip_wb, rtx *cfi_ops)
7042 {
7043 unsigned regno;
7044 unsigned regno2;
7045 poly_int64 offset;
7046 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7047
7048 for (regno = aarch64_next_callee_save (start, limit);
7049 regno <= limit;
7050 regno = aarch64_next_callee_save (regno + 1, limit))
7051 {
7052 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7053 if (cfun->machine->reg_is_wrapped_separately[regno])
7054 continue;
7055
7056 rtx reg, mem;
7057
7058 if (skip_wb
7059 && (regno == cfun->machine->frame.wb_candidate1
7060 || regno == cfun->machine->frame.wb_candidate2))
7061 continue;
7062
7063 machine_mode mode = aarch64_reg_save_mode (regno);
7064 reg = gen_rtx_REG (mode, regno);
7065 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7066 rtx base_rtx = stack_pointer_rtx;
7067 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7068 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7069 offset, ptrue);
7070 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7071
7072 if (!aarch64_sve_mode_p (mode)
7073 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7074 && !cfun->machine->reg_is_wrapped_separately[regno2]
7075 && known_eq (GET_MODE_SIZE (mode),
7076 cfun->machine->frame.reg_offset[regno2]
7077 - cfun->machine->frame.reg_offset[regno]))
7078 {
7079 rtx reg2 = gen_rtx_REG (mode, regno2);
7080 rtx mem2;
7081
7082 offset += GET_MODE_SIZE (mode);
7083 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7084 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7085
7086 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7087 regno = regno2;
7088 }
7089 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7090 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7091 else if (aarch64_sve_mode_p (mode))
7092 emit_insn (gen_rtx_SET (reg, mem));
7093 else
7094 emit_move_insn (reg, mem);
7095 if (frame_related_p)
7096 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7097 }
7098 }
7099
7100 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7101 of MODE. */
7102
7103 static inline bool
7104 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7105 {
7106 HOST_WIDE_INT multiple;
7107 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7108 && IN_RANGE (multiple, -8, 7));
7109 }
7110
7111 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7112 of MODE. */
7113
7114 static inline bool
7115 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7116 {
7117 HOST_WIDE_INT multiple;
7118 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7119 && IN_RANGE (multiple, 0, 63));
7120 }
7121
7122 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7123 of MODE. */
7124
7125 bool
7126 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7127 {
7128 HOST_WIDE_INT multiple;
7129 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7130 && IN_RANGE (multiple, -64, 63));
7131 }
7132
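/* For example, with MODE == DImode (8-byte units) this accepts byte offsets
   in the range [-512, 504] that are multiples of 8, matching the immediate
   range of LDP/STP with X registers.  */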
7133 /* Return true if OFFSET is a signed 9-bit value. */
7134
7135 bool
7136 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7137 poly_int64 offset)
7138 {
7139 HOST_WIDE_INT const_offset;
7140 return (offset.is_constant (&const_offset)
7141 && IN_RANGE (const_offset, -256, 255));
7142 }
7143
7144 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7145 of MODE. */
7146
7147 static inline bool
7148 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7149 {
7150 HOST_WIDE_INT multiple;
7151 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7152 && IN_RANGE (multiple, -256, 255));
7153 }
7154
7155 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7156 of MODE. */
7157
7158 static inline bool
7159 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7160 {
7161 HOST_WIDE_INT multiple;
7162 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7163 && IN_RANGE (multiple, 0, 4095));
7164 }
7165
7166 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7167
7168 static sbitmap
7169 aarch64_get_separate_components (void)
7170 {
7171 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7172 bitmap_clear (components);
7173
7174 /* The registers we need saved to the frame. */
7175 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7176 if (aarch64_register_saved_on_entry (regno))
7177 {
7178 /* Punt on saves and restores that use ST1D and LD1D. We could
7179 try to be smarter, but it would involve making sure that the
7180 spare predicate register itself is safe to use at the save
7181 and restore points. Also, when a frame pointer is being used,
7182 the slots are often out of reach of ST1D and LD1D anyway. */
7183 machine_mode mode = aarch64_reg_save_mode (regno);
7184 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7185 continue;
7186
7187 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7188
7189 /* If the register is saved in the first SVE save slot, we use
7190 it as a stack probe for -fstack-clash-protection. */
7191 if (flag_stack_clash_protection
7192 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7193 && known_eq (offset, 0))
7194 continue;
7195
7196 /* Get the offset relative to the register we'll use. */
7197 if (frame_pointer_needed)
7198 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7199 else
7200 offset += crtl->outgoing_args_size;
7201
7202 /* Check that we can access the stack slot of the register with one
7203 direct load with no adjustments needed. */
7204 if (aarch64_sve_mode_p (mode)
7205 ? offset_9bit_signed_scaled_p (mode, offset)
7206 : offset_12bit_unsigned_scaled_p (mode, offset))
7207 bitmap_set_bit (components, regno);
7208 }
7209
7210 /* Don't mess with the hard frame pointer. */
7211 if (frame_pointer_needed)
7212 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7213
7214 /* If the spare predicate register used by big-endian SVE code
7215 is call-preserved, it must be saved in the main prologue
7216 before any saves that use it. */
7217 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7218 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7219
7220 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7221 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7222 /* If registers have been chosen to be stored/restored with
7223 writeback, don't interfere with them to avoid having to output explicit
7224 stack adjustment instructions. */
7225 if (reg2 != INVALID_REGNUM)
7226 bitmap_clear_bit (components, reg2);
7227 if (reg1 != INVALID_REGNUM)
7228 bitmap_clear_bit (components, reg1);
7229
7230 bitmap_clear_bit (components, LR_REGNUM);
7231 bitmap_clear_bit (components, SP_REGNUM);
7232
7233 return components;
7234 }
7235
7236 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7237
7238 static sbitmap
7239 aarch64_components_for_bb (basic_block bb)
7240 {
7241 bitmap in = DF_LIVE_IN (bb);
7242 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7243 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7244
7245 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7246 bitmap_clear (components);
7247
7248 /* Clobbered registers don't generate values in any meaningful sense,
7249 since nothing after the clobber can rely on their value. And we can't
7250 say that partially-clobbered registers are unconditionally killed,
7251 because whether they're killed or not depends on the mode of the
7252 value they're holding. Thus partially call-clobbered registers
7253 appear in neither the kill set nor the gen set.
7254
7255 Check manually for any calls that clobber more of a register than the
7256 current function can. */
7257 function_abi_aggregator callee_abis;
7258 rtx_insn *insn;
7259 FOR_BB_INSNS (bb, insn)
7260 if (CALL_P (insn))
7261 callee_abis.note_callee_abi (insn_callee_abi (insn));
7262 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7263
7264 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7265 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7266 if (!fixed_regs[regno]
7267 && !crtl->abi->clobbers_full_reg_p (regno)
7268 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7269 || bitmap_bit_p (in, regno)
7270 || bitmap_bit_p (gen, regno)
7271 || bitmap_bit_p (kill, regno)))
7272 {
7273 bitmap_set_bit (components, regno);
7274
7275 /* If there is a callee-save at an adjacent offset, add it too
7276 to increase the use of LDP/STP. */
7277 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7278 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7279
7280 if (regno2 <= LAST_SAVED_REGNUM)
7281 {
7282 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7283 if (regno < regno2
7284 ? known_eq (offset + 8, offset2)
7285 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7286 bitmap_set_bit (components, regno2);
7287 }
7288 }
7289
7290 return components;
7291 }
7292
7293 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7294 Nothing to do for aarch64. */
7295
7296 static void
7297 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7298 {
7299 }
7300
7301 /* Return the next set bit in BMP from START onwards. Return the total number
7302 of bits in BMP if no set bit is found at or after START. */
7303
7304 static unsigned int
7305 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7306 {
7307 unsigned int nbits = SBITMAP_SIZE (bmp);
7308 if (start == nbits)
7309 return start;
7310
7311 gcc_assert (start < nbits);
7312 for (unsigned int i = start; i < nbits; i++)
7313 if (bitmap_bit_p (bmp, i))
7314 return i;
7315
7316 return nbits;
7317 }
7318
7319 /* Do the work for aarch64_emit_prologue_components and
7320 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7321 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7322 for these components or the epilogue sequence. That is, it determines
7323 whether we should emit stores or loads and what kind of CFA notes to attach
7324 to the insns. Otherwise the logic for the two sequences is very
7325 similar. */
7326
7327 static void
7328 aarch64_process_components (sbitmap components, bool prologue_p)
7329 {
7330 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7331 ? HARD_FRAME_POINTER_REGNUM
7332 : STACK_POINTER_REGNUM);
7333
7334 unsigned last_regno = SBITMAP_SIZE (components);
7335 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7336 rtx_insn *insn = NULL;
7337
7338 while (regno != last_regno)
7339 {
7340 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7341 machine_mode mode = aarch64_reg_save_mode (regno);
7342
7343 rtx reg = gen_rtx_REG (mode, regno);
7344 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7345 if (frame_pointer_needed)
7346 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7347 else
7348 offset += crtl->outgoing_args_size;
7349
7350 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7351 rtx mem = gen_frame_mem (mode, addr);
7352
7353 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7354 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7355 /* No more registers to handle after REGNO.
7356 Emit a single save/restore and exit. */
7357 if (regno2 == last_regno)
7358 {
7359 insn = emit_insn (set);
7360 if (frame_related_p)
7361 {
7362 RTX_FRAME_RELATED_P (insn) = 1;
7363 if (prologue_p)
7364 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7365 else
7366 add_reg_note (insn, REG_CFA_RESTORE, reg);
7367 }
7368 break;
7369 }
7370
7371 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7372 /* The next register is not of the same class or its offset is not
7373 mergeable with the current one into a pair. */
7374 if (aarch64_sve_mode_p (mode)
7375 || !satisfies_constraint_Ump (mem)
7376 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7377 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7378 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7379 GET_MODE_SIZE (mode)))
7380 {
7381 insn = emit_insn (set);
7382 if (frame_related_p)
7383 {
7384 RTX_FRAME_RELATED_P (insn) = 1;
7385 if (prologue_p)
7386 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7387 else
7388 add_reg_note (insn, REG_CFA_RESTORE, reg);
7389 }
7390
7391 regno = regno2;
7392 continue;
7393 }
7394
7395 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7396
7397 /* REGNO2 can be saved/restored in a pair with REGNO. */
7398 rtx reg2 = gen_rtx_REG (mode, regno2);
7399 if (frame_pointer_needed)
7400 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7401 else
7402 offset2 += crtl->outgoing_args_size;
7403 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7404 rtx mem2 = gen_frame_mem (mode, addr2);
7405 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7406 : gen_rtx_SET (reg2, mem2);
7407
7408 if (prologue_p)
7409 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7410 else
7411 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7412
7413 if (frame_related_p || frame_related2_p)
7414 {
7415 RTX_FRAME_RELATED_P (insn) = 1;
7416 if (prologue_p)
7417 {
7418 if (frame_related_p)
7419 add_reg_note (insn, REG_CFA_OFFSET, set);
7420 if (frame_related2_p)
7421 add_reg_note (insn, REG_CFA_OFFSET, set2);
7422 }
7423 else
7424 {
7425 if (frame_related_p)
7426 add_reg_note (insn, REG_CFA_RESTORE, reg);
7427 if (frame_related2_p)
7428 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7429 }
7430 }
7431
7432 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7433 }
7434 }
7435
7436 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7437
7438 static void
7439 aarch64_emit_prologue_components (sbitmap components)
7440 {
7441 aarch64_process_components (components, true);
7442 }
7443
7444 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7445
7446 static void
7447 aarch64_emit_epilogue_components (sbitmap components)
7448 {
7449 aarch64_process_components (components, false);
7450 }
7451
7452 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7453
7454 static void
7455 aarch64_set_handled_components (sbitmap components)
7456 {
7457 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7458 if (bitmap_bit_p (components, regno))
7459 cfun->machine->reg_is_wrapped_separately[regno] = true;
7460 }
7461
7462 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7463 determine the probe offset for alloca. */
7464
7465 static HOST_WIDE_INT
7466 aarch64_stack_clash_protection_alloca_probe_range (void)
7467 {
7468 return STACK_CLASH_CALLER_GUARD;
7469 }
7470
7471
7472 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7473 registers. If POLY_SIZE is not large enough to require a probe this function
7474 will only adjust the stack. When allocating the stack space
7475 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7476 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7477 arguments. If we are then we ensure that any allocation larger than the ABI
7478 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7479 maintained.
7480
7481 We emit barriers after each stack adjustment to prevent optimizations from
7482 breaking the invariant that we never drop the stack more than a page. This
7483 invariant is needed to make it easier to correctly handle asynchronous
7484 events: if we were to allow the stack to be dropped by more than a page,
7485 with several probes still pending, and we took a signal somewhere in
7486 between, the signal handler would not know the state of the stack and
7487 could make no assumptions about which pages have been probed. */
7488
7489 static void
7490 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7491 poly_int64 poly_size,
7492 bool frame_related_p,
7493 bool final_adjustment_p)
7494 {
7495 HOST_WIDE_INT guard_size
7496 = 1 << param_stack_clash_protection_guard_size;
7497 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7498 HOST_WIDE_INT min_probe_threshold
7499 = (final_adjustment_p
7500 ? guard_used_by_caller
7501 : guard_size - guard_used_by_caller);
7502 /* When doing the final adjustment for the outgoing arguments, take into
7503 account any unprobed space there is above the current SP. There are
7504 two cases:
7505
7506 - When saving SVE registers below the hard frame pointer, we force
7507 the lowest save to take place in the prologue before doing the final
7508 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7509 This acts as a probe at SP, so there is no unprobed space.
7510
7511 - When there are no SVE register saves, we use the store of the link
7512 register as a probe. We can't assume that LR was saved at position 0
7513 though, so treat any space below it as unprobed. */
7514 if (final_adjustment_p
7515 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7516 {
7517 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7518 if (known_ge (lr_offset, 0))
7519 min_probe_threshold -= lr_offset.to_constant ();
7520 else
7521 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7522 }
7523
7524 poly_int64 frame_size = cfun->machine->frame.frame_size;
7525
7526 /* We should always have a positive probe threshold. */
7527 gcc_assert (min_probe_threshold > 0);
7528
7529 if (flag_stack_clash_protection && !final_adjustment_p)
7530 {
7531 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7532 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7533 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7534
7535 if (known_eq (frame_size, 0))
7536 {
7537 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7538 }
7539 else if (known_lt (initial_adjust + sve_callee_adjust,
7540 guard_size - guard_used_by_caller)
7541 && known_lt (final_adjust, guard_used_by_caller))
7542 {
7543 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7544 }
7545 }
7546
7547 /* If SIZE is not large enough to require probing, just adjust the stack and
7548 exit. */
7549 if (known_lt (poly_size, min_probe_threshold)
7550 || !flag_stack_clash_protection)
7551 {
7552 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7553 return;
7554 }
7555
7556 HOST_WIDE_INT size;
7557 /* Handle the SVE non-constant case first. */
7558 if (!poly_size.is_constant (&size))
7559 {
7560 if (dump_file)
7561 {
7562 fprintf (dump_file, "Stack clash SVE prologue: ");
7563 print_dec (poly_size, dump_file);
7564 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7565 }
7566
7567 /* First calculate the amount of bytes we're actually spilling. */
7568 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7569 poly_size, temp1, temp2, false, true);
7570
7571 rtx_insn *insn = get_last_insn ();
7572
7573 if (frame_related_p)
7574 {
7575 /* This is done to provide unwinding information for the stack
7576 adjustments we're about to do. However, to prevent the optimizers
7577 from removing the R11 move and leaving the CFA note (which would be
7578 very wrong), we tie the old and new stack pointers together.
7579 The tie will expand to nothing but the optimizers will not touch
7580 the instruction. */
7581 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7582 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7583 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7584
7585 /* We want the CFA independent of the stack pointer for the
7586 duration of the loop. */
7587 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7588 RTX_FRAME_RELATED_P (insn) = 1;
7589 }
7590
7591 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7592 rtx guard_const = gen_int_mode (guard_size, Pmode);
7593
7594 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7595 stack_pointer_rtx, temp1,
7596 probe_const, guard_const));
7597
7598 /* Now reset the CFA register if needed. */
7599 if (frame_related_p)
7600 {
7601 add_reg_note (insn, REG_CFA_DEF_CFA,
7602 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7603 gen_int_mode (poly_size, Pmode)));
7604 RTX_FRAME_RELATED_P (insn) = 1;
7605 }
7606
7607 return;
7608 }
7609
7610 if (dump_file)
7611 fprintf (dump_file,
7612 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7613 " bytes, probing will be required.\n", size);
7614
7615 /* Round size to the nearest multiple of guard_size, and calculate the
7616 residual as the difference between the original size and the rounded
7617 size. */
7618 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7619 HOST_WIDE_INT residual = size - rounded_size;
7620
7621 /* We can handle a small number of allocations/probes inline. Otherwise
7622 punt to a loop. */
7623 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7624 {
7625 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7626 {
7627 aarch64_sub_sp (NULL, temp2, guard_size, true);
7628 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7629 guard_used_by_caller));
7630 emit_insn (gen_blockage ());
7631 }
7632 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7633 }
7634 else
7635 {
7636 /* Compute the ending address. */
7637 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7638 temp1, NULL, false, true);
7639 rtx_insn *insn = get_last_insn ();
7640
7641 /* For the initial allocation, we don't have a frame pointer
7642 set up, so we always need CFI notes. If we're doing the
7643 final allocation, then we may have a frame pointer, in which
7644 case it is the CFA, otherwise we need CFI notes.
7645
7646 We can determine which allocation we are doing by looking at
7647 the value of FRAME_RELATED_P since the final allocations are not
7648 frame related. */
7649 if (frame_related_p)
7650 {
7651 /* We want the CFA independent of the stack pointer for the
7652 duration of the loop. */
7653 add_reg_note (insn, REG_CFA_DEF_CFA,
7654 plus_constant (Pmode, temp1, rounded_size));
7655 RTX_FRAME_RELATED_P (insn) = 1;
7656 }
7657
7658 /* This allocates and probes the stack. Note that this re-uses some of
7659 the existing Ada stack protection code. However, we are guaranteed not
7660 to enter the non-loop or residual branches of that code.
7661
7662 The non-loop part won't be entered because if our allocation amount
7663 doesn't require a loop, the case above would handle it.
7664
7665 The residual amount won't be entered because TEMP1 is a multiple of
7666 the allocation size. The residual will always be 0. As such, the only
7667 part we are actually using from that code is the loop setup. The
7668 actual probing is done in aarch64_output_probe_stack_range. */
7669 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7670 stack_pointer_rtx, temp1));
7671
7672 /* Now reset the CFA register if needed. */
7673 if (frame_related_p)
7674 {
7675 add_reg_note (insn, REG_CFA_DEF_CFA,
7676 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7677 RTX_FRAME_RELATED_P (insn) = 1;
7678 }
7679
7680 emit_insn (gen_blockage ());
7681 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7682 }
7683
7684 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7685 be probed. This maintains the requirement that each page is probed at
7686 least once. For initial probing we probe only if the allocation is
7687 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7688 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7689 GUARD_SIZE. This means that for any allocation that is large enough to
7690 trigger a probe here, we'll have at least one, while for any allocation
7691 that is not large enough for this code to emit anything, the page will
7692 already have been probed by the save of FP/LR, either in this function or
7693 in one of its callees. If we don't have any callees then we won't have any
7694 further stack adjustments and so are still safe. */
7695 if (residual)
7696 {
7697 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7698 /* If we're doing final adjustments, and we've done any full page
7699 allocations then any residual needs to be probed. */
7700 if (final_adjustment_p && rounded_size != 0)
7701 min_probe_threshold = 0;
7702 /* If doing a small final adjustment, we always probe at offset 0.
7703 This is done to avoid issues when LR is not at position 0 or when
7704 the final adjustment is smaller than the probing offset. */
7705 else if (final_adjustment_p && rounded_size == 0)
7706 residual_probe_offset = 0;
7707
7708 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7709 if (residual >= min_probe_threshold)
7710 {
7711 if (dump_file)
7712 fprintf (dump_file,
7713 "Stack clash AArch64 prologue residuals: "
7714 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7715 "\n", residual);
7716
7717 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7718 residual_probe_offset));
7719 emit_insn (gen_blockage ());
7720 }
7721 }
7722 }
7723
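/* As a sketch of the inline case above (assuming the default 64KiB guard
   and the 1KiB STACK_CLASH_CALLER_GUARD), allocating a little over two
   pages produces roughly:

	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #residual
	str	xzr, [sp, #1024]	// only if the residual itself needs a probe

   The loop-based variant replaces the unrolled subtractions with
   probe_stack_range.  */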
7724 /* Return 1 if the register is used by the epilogue. We need to say the
7725 return register is used, but only after epilogue generation is complete.
7726 Note that in the case of sibcalls, the values "used by the epilogue" are
7727 considered live at the start of the called function.
7728
7729 For SIMD functions we need to return 1 for FP registers that are saved and
7730 restored by a function but are not zero in call_used_regs. If we do not do
7731 this, optimizations may remove the restore of the register. */
7732
7733 int
7734 aarch64_epilogue_uses (int regno)
7735 {
7736 if (epilogue_completed)
7737 {
7738 if (regno == LR_REGNUM)
7739 return 1;
7740 }
7741 return 0;
7742 }
7743
7744 /* AArch64 stack frames generated by this compiler look like:
7745
7746 +-------------------------------+
7747 | |
7748 | incoming stack arguments |
7749 | |
7750 +-------------------------------+
7751 | | <-- incoming stack pointer (aligned)
7752 | callee-allocated save area |
7753 | for register varargs |
7754 | |
7755 +-------------------------------+
7756 | local variables | <-- frame_pointer_rtx
7757 | |
7758 +-------------------------------+
7759 | padding | \
7760 +-------------------------------+ |
7761 | callee-saved registers | | frame.saved_regs_size
7762 +-------------------------------+ |
7763 | LR' | |
7764 +-------------------------------+ |
7765 | FP' | |
7766 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7767 | SVE vector registers | | \
7768 +-------------------------------+ | | below_hard_fp_saved_regs_size
7769 | SVE predicate registers | / /
7770 +-------------------------------+
7771 | dynamic allocation |
7772 +-------------------------------+
7773 | padding |
7774 +-------------------------------+
7775 | outgoing stack arguments | <-- arg_pointer
7776 | |
7777 +-------------------------------+
7778 | | <-- stack_pointer_rtx (aligned)
7779
7780 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7781 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7782 unchanged.
7783
7784 By default for stack-clash we assume the guard is at least 64KB, but this
7785 value is configurable to either 4KB or 64KB. We also force the guard size to
7786 be the same as the probing interval and both values are kept in sync.
7787
7788 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7789 on the guard size) of stack space without probing.
7790
7791 When probing is needed, we emit a probe at the start of the prologue
7792 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7793
7794 We have to track how much space has been allocated and the only stores
7795 to the stack we track as implicit probes are the FP/LR stores.
7796
7797 For outgoing arguments we probe if the size is larger than 1KB, such that
7798 the ABI specified buffer is maintained for the next callee.
7799
7800 The following registers are reserved during frame layout and should not be
7801 used for any other purpose:
7802
7803 - r11: Used by stack clash protection when SVE is enabled, and also
7804 as an anchor register when saving and restoring registers
7805 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7806 - r14 and r15: Used for speculation tracking.
7807 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7808 - r30(LR), r29(FP): Used by standard frame layout.
7809
7810 These registers must be avoided in frame layout related code unless the
7811 explicit intention is to interact with one of the features listed above. */
7812
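/* For a small fixed-size frame this typically boils down to a prologue of
   the form (illustrative only; the exact split of the adjustments is
   decided by aarch64_layout_frame and emitted by aarch64_expand_prologue
   below):

	stp	x29, x30, [sp, #-FRAME]!	// callee_adjust, saves FP/LR
	mov	x29, sp				// set up the frame chain
	str	x19, [sp, #16]			// further callee saves, if any
	sub	sp, sp, #OUTGOING		// final_adjust, if needed

   and a matching epilogue that undoes the adjustments in reverse.  */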
7813 /* Generate the prologue instructions for entry into a function.
7814 Establish the stack frame by decreasing the stack pointer with a
7815 properly calculated size and, if necessary, create a frame record
7816 filled with the values of LR and previous frame pointer. The
7817 current FP is also set up if it is in use. */
7818
7819 void
7820 aarch64_expand_prologue (void)
7821 {
7822 poly_int64 frame_size = cfun->machine->frame.frame_size;
7823 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7824 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7825 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7826 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7827 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7828 poly_int64 below_hard_fp_saved_regs_size
7829 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7830 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7831 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7832 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7833 rtx_insn *insn;
7834
7835 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7836 {
7837 /* Fold the SVE allocation into the initial allocation.
7838 We don't do this in aarch64_layout_frame to avoid pessimizing
7839 the epilogue code. */
7840 initial_adjust += sve_callee_adjust;
7841 sve_callee_adjust = 0;
7842 }
7843
7844 /* Sign return address for functions. */
7845 if (aarch64_return_address_signing_enabled ())
7846 {
7847 switch (aarch64_ra_sign_key)
7848 {
7849 case AARCH64_KEY_A:
7850 insn = emit_insn (gen_paciasp ());
7851 break;
7852 case AARCH64_KEY_B:
7853 insn = emit_insn (gen_pacibsp ());
7854 break;
7855 default:
7856 gcc_unreachable ();
7857 }
7858 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7859 RTX_FRAME_RELATED_P (insn) = 1;
7860 }
7861
7862 if (flag_stack_usage_info)
7863 current_function_static_stack_size = constant_lower_bound (frame_size);
7864
7865 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7866 {
7867 if (crtl->is_leaf && !cfun->calls_alloca)
7868 {
7869 if (maybe_gt (frame_size, PROBE_INTERVAL)
7870 && maybe_gt (frame_size, get_stack_check_protect ()))
7871 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7872 (frame_size
7873 - get_stack_check_protect ()));
7874 }
7875 else if (maybe_gt (frame_size, 0))
7876 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7877 }
7878
7879 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7880 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7881
7882 /* In theory we should never have both an initial adjustment
7883 and a callee save adjustment. Verify that this is the case, since the
7884 code below does not handle it for -fstack-clash-protection. */
7885 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7886
7887 /* Will only probe if the initial adjustment is larger than the guard
7888 less the amount of the guard reserved for use by the caller's
7889 outgoing args. */
7890 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7891 true, false);
7892
7893 if (callee_adjust != 0)
7894 aarch64_push_regs (reg1, reg2, callee_adjust);
7895
7896 /* The offset of the frame chain record (if any) from the current SP. */
7897 poly_int64 chain_offset = (initial_adjust + callee_adjust
7898 - cfun->machine->frame.hard_fp_offset);
7899 gcc_assert (known_ge (chain_offset, 0));
7900
7901 /* The offset of the bottom of the save area from the current SP. */
7902 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7903
7904 if (emit_frame_chain)
7905 {
7906 if (callee_adjust == 0)
7907 {
7908 reg1 = R29_REGNUM;
7909 reg2 = R30_REGNUM;
7910 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7911 false, false);
7912 }
7913 else
7914 gcc_assert (known_eq (chain_offset, 0));
7915 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7916 stack_pointer_rtx, chain_offset,
7917 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7918 if (frame_pointer_needed && !frame_size.is_constant ())
7919 {
7920 /* Variable-sized frames need to describe the save slot
7921 address using DW_CFA_expression rather than DW_CFA_offset.
7922 This means that, without taking further action, the
7923 locations of the registers that we've already saved would
7924 remain based on the stack pointer even after we redefine
7925 the CFA based on the frame pointer. We therefore need new
7926 DW_CFA_expressions to re-express the save slots with addresses
7927 based on the frame pointer. */
7928 rtx_insn *insn = get_last_insn ();
7929 gcc_assert (RTX_FRAME_RELATED_P (insn));
7930
7931 /* Add an explicit CFA definition if this was previously
7932 implicit. */
7933 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7934 {
7935 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7936 callee_offset);
7937 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7938 gen_rtx_SET (hard_frame_pointer_rtx, src));
7939 }
7940
7941 /* Change the save slot expressions for the registers that
7942 we've already saved. */
7943 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7944 hard_frame_pointer_rtx, UNITS_PER_WORD);
7945 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7946 hard_frame_pointer_rtx, 0);
7947 }
7948 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7949 }
7950
7951 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7952 callee_adjust != 0 || emit_frame_chain,
7953 emit_frame_chain);
7954 if (maybe_ne (sve_callee_adjust, 0))
7955 {
7956 gcc_assert (!flag_stack_clash_protection
7957 || known_eq (initial_adjust, 0));
7958 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7959 sve_callee_adjust,
7960 !frame_pointer_needed, false);
7961 saved_regs_offset += sve_callee_adjust;
7962 }
7963 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7964 false, emit_frame_chain);
7965 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7966 callee_adjust != 0 || emit_frame_chain,
7967 emit_frame_chain);
7968
7969 /* We may need to probe the final adjustment if it is larger than the guard
7970 that is assumed by the callee. */
7971 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7972 !frame_pointer_needed, true);
7973 }
7974
7975 /* Return TRUE if we can use a simple_return insn.
7976
7977 This function checks whether the callee-saved stack area is empty, which
7978 means no restore actions are needed. The pro_and_epilogue pass uses this
7979 to check whether the shrink-wrapping optimization is feasible. */
7980
7981 bool
7982 aarch64_use_return_insn_p (void)
7983 {
7984 if (!reload_completed)
7985 return false;
7986
7987 if (crtl->profile)
7988 return false;
7989
7990 return known_eq (cfun->machine->frame.frame_size, 0);
7991 }
7992
7993 /* Generate the epilogue instructions for returning from a function.
7994 This is almost exactly the reverse of the prolog sequence, except
7995 that we need to insert barriers to avoid scheduling loads that read
7996 from a deallocated stack, and we optimize the unwind records by
7997 emitting them all together if possible. */
7998 void
7999 aarch64_expand_epilogue (bool for_sibcall)
8000 {
8001 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8002 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8003 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8004 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8005 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8006 poly_int64 below_hard_fp_saved_regs_size
8007 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8008 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8009 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8010 rtx cfi_ops = NULL;
8011 rtx_insn *insn;
8012 /* A stack clash protection prologue may not have left EP0_REGNUM or
8013 EP1_REGNUM in a usable state. The same is true for allocations
8014 with an SVE component, since we then need both temporary registers
8015 for each allocation. For stack clash we are in a usable state if
8016 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8017 HOST_WIDE_INT guard_size
8018 = 1 << param_stack_clash_protection_guard_size;
8019 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8020
8021 /* We can re-use the registers when:
8022
8023 (a) the deallocation amount is the same as the corresponding
8024 allocation amount (which is false if we combine the initial
8025 and SVE callee save allocations in the prologue); and
8026
8027 (b) the allocation amount doesn't need a probe (which is false
8028 if the amount is guard_size - guard_used_by_caller or greater).
8029
8030 In such situations the register should remain live with the correct
8031 value. */
8032 bool can_inherit_p = (initial_adjust.is_constant ()
8033 && final_adjust.is_constant ()
8034 && (!flag_stack_clash_protection
8035 || (known_lt (initial_adjust,
8036 guard_size - guard_used_by_caller)
8037 && known_eq (sve_callee_adjust, 0))));
8038
8039 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
8040 bool need_barrier_p
8041 = maybe_ne (get_frame_size ()
8042 + cfun->machine->frame.saved_varargs_size, 0);
8043
8044 /* Emit a barrier to prevent loads from a deallocated stack. */
8045 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8046 || cfun->calls_alloca
8047 || crtl->calls_eh_return)
8048 {
8049 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8050 need_barrier_p = false;
8051 }
8052
8053 /* Restore the stack pointer from the frame pointer if it may not
8054 be the same as the stack pointer. */
8055 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8056 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8057 if (frame_pointer_needed
8058 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8059 /* If writeback is used when restoring callee-saves, the CFA
8060 is restored on the instruction doing the writeback. */
8061 aarch64_add_offset (Pmode, stack_pointer_rtx,
8062 hard_frame_pointer_rtx,
8063 -callee_offset - below_hard_fp_saved_regs_size,
8064 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8065 else
8066 /* The case where we need to re-use the register here is very rare, so
8067 avoid the complicated condition and just always emit a move if the
8068 immediate doesn't fit. */
8069 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8070
8071 /* Restore the vector registers before the predicate registers,
8072 so that we can use P4 as a temporary for big-endian SVE frames. */
8073 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8074 callee_adjust != 0, &cfi_ops);
8075 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8076 false, &cfi_ops);
8077 if (maybe_ne (sve_callee_adjust, 0))
8078 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8079 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8080 R0_REGNUM, R30_REGNUM,
8081 callee_adjust != 0, &cfi_ops);
8082
8083 if (need_barrier_p)
8084 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8085
8086 if (callee_adjust != 0)
8087 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8088
8089 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
8090 {
8091 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
8092 insn = get_last_insn ();
8093 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8094 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8095 RTX_FRAME_RELATED_P (insn) = 1;
8096 cfi_ops = NULL;
8097 }
8098
8099 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8100 we restrict the emit_move optimization to leaf functions. */
8101 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8102 (!can_inherit_p || !crtl->is_leaf
8103 || df_regs_ever_live_p (EP0_REGNUM)));
8104
8105 if (cfi_ops)
8106 {
8107 /* Emit delayed restores and reset the CFA to be SP. */
8108 insn = get_last_insn ();
8109 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8110 REG_NOTES (insn) = cfi_ops;
8111 RTX_FRAME_RELATED_P (insn) = 1;
8112 }
8113
8114 /* We prefer to emit the combined return/authenticate instruction RETAA;
8115 however, there are three cases in which we must instead emit an explicit
8116 authentication instruction.
8117
8118 1) Sibcalls don't return in a normal way, so if we're about to call one
8119 we must authenticate.
8120
8121 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8122 generating code for !TARGET_ARMV8_3 we can't use it and must
8123 explicitly authenticate.
8124
8125 3) On an eh_return path we make extra stack adjustments to update the
8126 canonical frame address to be the exception handler's CFA. We want
8127 to authenticate using the CFA of the function which calls eh_return.
8128 */
8129 if (aarch64_return_address_signing_enabled ()
8130 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8131 {
8132 switch (aarch64_ra_sign_key)
8133 {
8134 case AARCH64_KEY_A:
8135 insn = emit_insn (gen_autiasp ());
8136 break;
8137 case AARCH64_KEY_B:
8138 insn = emit_insn (gen_autibsp ());
8139 break;
8140 default:
8141 gcc_unreachable ();
8142 }
8143 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8144 RTX_FRAME_RELATED_P (insn) = 1;
8145 }
8146
8147 /* Stack adjustment for exception handler. */
8148 if (crtl->calls_eh_return && !for_sibcall)
8149 {
8150 /* We need to unwind the stack by the offset computed by
8151 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8152 to be SP; letting the CFA move during this adjustment
8153 is just as correct as retaining the CFA from the body
8154 of the function. Therefore, do nothing special. */
8155 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8156 }
8157
8158 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8159 if (!for_sibcall)
8160 emit_jump_insn (ret_rtx);
8161 }
8162
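/* The common small-frame case therefore ends with something like
   (illustrative):

	ldp	x29, x30, [sp], #FRAME
	retaa				// or autiasp + ret before Armv8.3-A

   when return address signing is enabled, and a plain RET otherwise.  */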
8163 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8164 normally or return to a previous frame after unwinding.
8165
8166 An EH return uses a single shared return sequence. The epilogue is
8167 exactly like a normal epilogue except that it has an extra input
8168 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8169 that must be applied after the frame has been destroyed. An extra label
8170 is inserted before the epilogue which initializes this register to zero,
8171 and this is the entry point for a normal return.
8172
8173 An actual EH return updates the return address, initializes the stack
8174 adjustment and jumps directly into the epilogue (bypassing the zeroing
8175 of the adjustment). Since the return address is typically saved on the
8176 stack when a function makes a call, the saved LR must be updated outside
8177 the epilogue.
8178
8179 This poses problems as the store is generated well before the epilogue,
8180 so the offset of LR is not known yet. Also optimizations will remove the
8181 store as it appears dead, even after the epilogue is generated (as the
8182 base or offset for loading LR is different in many cases).
8183
8184 To avoid these problems this implementation forces the frame pointer
8185 in eh_return functions so that the location of LR is fixed and known early.
8186 It also marks the store volatile, so no optimization is permitted to
8187 remove the store. */
8188 rtx
8189 aarch64_eh_return_handler_rtx (void)
8190 {
8191 rtx tmp = gen_frame_mem (Pmode,
8192 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8193
8194 /* Mark the store volatile, so no optimization is permitted to remove it. */
8195 MEM_VOLATILE_P (tmp) = true;
8196 return tmp;
8197 }
8198
8199 /* Output code to add DELTA to the first argument, and then jump
8200 to FUNCTION. Used for C++ multiple inheritance. */
8201 static void
8202 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8203 HOST_WIDE_INT delta,
8204 HOST_WIDE_INT vcall_offset,
8205 tree function)
8206 {
8207 /* The this pointer is always in x0. Note that this differs from
8208 Arm where the this pointer may be bumped to r1 if r0 is required
8209 to return a pointer to an aggregate. On AArch64 a result value
8210 pointer will be in x8. */
8211 int this_regno = R0_REGNUM;
8212 rtx this_rtx, temp0, temp1, addr, funexp;
8213 rtx_insn *insn;
8214 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8215
8216 if (aarch64_bti_enabled ())
8217 emit_insn (gen_bti_c ());
8218
8219 reload_completed = 1;
8220 emit_note (NOTE_INSN_PROLOGUE_END);
8221
8222 this_rtx = gen_rtx_REG (Pmode, this_regno);
8223 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8224 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8225
8226 if (vcall_offset == 0)
8227 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8228 else
8229 {
8230 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8231
8232 addr = this_rtx;
8233 if (delta != 0)
8234 {
8235 if (delta >= -256 && delta < 256)
8236 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8237 plus_constant (Pmode, this_rtx, delta));
8238 else
8239 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8240 temp1, temp0, false);
8241 }
8242
8243 if (Pmode == ptr_mode)
8244 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8245 else
8246 aarch64_emit_move (temp0,
8247 gen_rtx_ZERO_EXTEND (Pmode,
8248 gen_rtx_MEM (ptr_mode, addr)));
8249
8250 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8251 addr = plus_constant (Pmode, temp0, vcall_offset);
8252 else
8253 {
8254 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8255 Pmode);
8256 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8257 }
8258
8259 if (Pmode == ptr_mode)
8260 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
8261 else
8262 aarch64_emit_move (temp1,
8263 gen_rtx_SIGN_EXTEND (Pmode,
8264 gen_rtx_MEM (ptr_mode, addr)));
8265
8266 emit_insn (gen_add2_insn (this_rtx, temp1));
8267 }
8268
8269 /* Generate a tail call to the target function. */
8270 if (!TREE_USED (function))
8271 {
8272 assemble_external (function);
8273 TREE_USED (function) = 1;
8274 }
8275 funexp = XEXP (DECL_RTL (function), 0);
8276 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8277 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8278 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8279 SIBLING_CALL_P (insn) = 1;
8280
8281 insn = get_insns ();
8282 shorten_branches (insn);
8283
8284 assemble_start_function (thunk, fnname);
8285 final_start_function (insn, file, 1);
8286 final (insn, file, 1);
8287 final_end_function ();
8288 assemble_end_function (thunk, fnname);
8289
8290 /* Stop pretending to be a post-reload pass. */
8291 reload_completed = 0;
8292 }
8293
8294 static bool
8295 aarch64_tls_referenced_p (rtx x)
8296 {
8297 if (!TARGET_HAVE_TLS)
8298 return false;
8299 subrtx_iterator::array_type array;
8300 FOR_EACH_SUBRTX (iter, array, x, ALL)
8301 {
8302 const_rtx x = *iter;
8303 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8304 return true;
8305 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8306 TLS offsets, not real symbol references. */
8307 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8308 iter.skip_subrtxes ();
8309 }
8310 return false;
8311 }
8312
8313
8314 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8315 a left shift of 0 or 12 bits. */
8316 bool
8317 aarch64_uimm12_shift (HOST_WIDE_INT val)
8318 {
8319 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8320 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8321 );
8322 }
8323
8324 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
8325 that can be created with a left shift of 0 or 12. */
8326 static HOST_WIDE_INT
8327 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8328 {
8329 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8330 handle correctly. */
8331 gcc_assert ((val & 0xffffff) == val);
8332
8333 if (((val & 0xfff) << 0) == val)
8334 return val;
8335
8336 return val & (0xfff << 12);
8337 }
8338
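/* For example, 0x123456 does not fit a single shifted 12-bit immediate, so
   it is clamped to 0x123000, which can be added with
   "add xD, xN, #0x123, lsl #12", leaving the remaining 0x456 to be handled
   separately by the caller.  */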
8339 /* Return true if val is an immediate that can be loaded into a
8340 register by a MOVZ instruction. */
8341 static bool
8342 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8343 {
8344 if (GET_MODE_SIZE (mode) > 4)
8345 {
8346 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8347 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8348 return 1;
8349 }
8350 else
8351 {
8352 /* Ignore sign extension. */
8353 val &= (HOST_WIDE_INT) 0xffffffff;
8354 }
8355 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8356 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8357 }
8358
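/* For instance, 0x12340000 satisfies this test (a 16-bit value shifted by
   16) and can be loaded with a single MOVZ ("movz wD, #0x1234, lsl #16"),
   whereas 0x12345678 does not and needs a MOVZ/MOVK pair.  */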
8359 /* Test whether:
8360
8361 X = (X & AND_VAL) | IOR_VAL;
8362
8363 can be implemented using:
8364
8365 MOVK X, #(IOR_VAL >> shift), LSL #shift
8366
8367 Return the shift if so, otherwise return -1. */
8368 int
8369 aarch64_movk_shift (const wide_int_ref &and_val,
8370 const wide_int_ref &ior_val)
8371 {
8372 unsigned int precision = and_val.get_precision ();
8373 unsigned HOST_WIDE_INT mask = 0xffff;
8374 for (unsigned int shift = 0; shift < precision; shift += 16)
8375 {
8376 if (and_val == ~mask && (ior_val & mask) == ior_val)
8377 return shift;
8378 mask <<= 16;
8379 }
8380 return -1;
8381 }
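/* Illustrative example: with AND_VAL = 0xffffffff0000ffff (i.e.
   ~(0xffff << 16)) and IOR_VAL = 0x12340000, the function returns 16,
   matching the single instruction MOVK Xd, #0x1234, LSL #16.  */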
8382
8383 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
8384 64-bit (DImode) integer. */
8385
8386 static unsigned HOST_WIDE_INT
8387 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8388 {
8389 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8390 while (size < 64)
8391 {
8392 val &= (HOST_WIDE_INT_1U << size) - 1;
8393 val |= val << size;
8394 size *= 2;
8395 }
8396 return val;
8397 }
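/* Illustrative example: for an 8-bit element mode, VAL = 0xab is widened
   to 0xabab, then 0xabababab, then 0xabababababababab.  */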
8398
8399 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8400
8401 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8402 {
8403 0x0000000100000001ull,
8404 0x0001000100010001ull,
8405 0x0101010101010101ull,
8406 0x1111111111111111ull,
8407 0x5555555555555555ull,
8408 };
8409
8410
8411 /* Return true if val is a valid bitmask immediate. */
8412
8413 bool
8414 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8415 {
8416 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8417 int bits;
8418
8419 /* Check for a single sequence of one bits and return quickly if so.
8420 The special cases of all ones and all zeroes return false. */
8421 val = aarch64_replicate_bitmask_imm (val_in, mode);
8422 tmp = val + (val & -val);
8423
8424 if (tmp == (tmp & -tmp))
8425 return (val + 1) > 1;
8426
8427 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8428 if (mode == SImode)
8429 val = (val << 32) | (val & 0xffffffff);
8430
8431 /* Invert if the immediate doesn't start with a zero bit - this means we
8432 only need to search for sequences of one bits. */
8433 if (val & 1)
8434 val = ~val;
8435
8436 /* Find the first set bit and set tmp to val with the first sequence of one
8437 bits removed. Return success if there is a single sequence of ones. */
8438 first_one = val & -val;
8439 tmp = val & (val + first_one);
8440
8441 if (tmp == 0)
8442 return true;
8443
8444 /* Find the next set bit and compute the difference in bit position. */
8445 next_one = tmp & -tmp;
8446 bits = clz_hwi (first_one) - clz_hwi (next_one);
8447 mask = val ^ tmp;
8448
8449 /* Check the bit position difference is a power of 2, and that the first
8450 sequence of one bits fits within 'bits' bits. */
8451 if ((mask >> bits) != 0 || bits != (bits & -bits))
8452 return false;
8453
8454 /* Check the sequence of one bits is repeated 64/bits times. */
8455 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8456 }
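/* Worked example (illustrative): 0x00ff00ff00ff00ff is accepted because
   it is the 16-bit element 0x00ff repeated four times, so it can be
   encoded as a logical (bitmask) immediate.  0x0000000000001234 is
   rejected: after removing the first run of ones, the remaining bits do
   not form a repetition of a single power-of-two-sized element.  */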
8457
8458 /* Create a mask of ones covering the range from the lowest to the highest
8459 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
8460
8461 unsigned HOST_WIDE_INT
8462 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8463 {
8464 int lowest_bit_set = ctz_hwi (val_in);
8465 int highest_bit_set = floor_log2 (val_in);
8466 gcc_assert (val_in != 0);
8467
8468 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8469 (HOST_WIDE_INT_1U << lowest_bit_set));
8470 }
8471
8472 /* Create constant where bits outside of lowest bit set to highest bit set
8473 are set to 1. */
8474
8475 unsigned HOST_WIDE_INT
8476 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8477 {
8478 return val_in | ~aarch64_and_split_imm1 (val_in);
8479 }
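/* Worked example (illustrative): for VAL_IN = 0x00f00f00,
   aarch64_and_split_imm1 returns 0x00ffff00 (ones from the lowest to the
   highest set bit) and aarch64_and_split_imm2 returns 0xfffffffffff00fff
   (VAL_IN with all bits outside that range set).  Since imm1 & imm2 ==
   VAL_IN, an AND with VAL_IN can be split into two ANDs with these
   (potentially encodable) bitmask immediates.  */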
8480
8481 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8482
8483 bool
8484 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8485 {
8486 scalar_int_mode int_mode;
8487 if (!is_a <scalar_int_mode> (mode, &int_mode))
8488 return false;
8489
8490 if (aarch64_bitmask_imm (val_in, int_mode))
8491 return false;
8492
8493 if (aarch64_move_imm (val_in, int_mode))
8494 return false;
8495
8496 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8497
8498 return aarch64_bitmask_imm (imm2, int_mode);
8499 }
8500
8501 /* Return true if val is an immediate that can be loaded into a
8502 register in a single instruction. */
8503 bool
8504 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8505 {
8506 scalar_int_mode int_mode;
8507 if (!is_a <scalar_int_mode> (mode, &int_mode))
8508 return false;
8509
8510 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8511 return true;
8512 return aarch64_bitmask_imm (val, int_mode);
8513 }
8514
8515 static bool
8516 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8517 {
8518 rtx base, offset;
8519
8520 if (GET_CODE (x) == HIGH)
8521 return true;
8522
8523 /* There's no way to calculate VL-based values using relocations. */
8524 subrtx_iterator::array_type array;
8525 FOR_EACH_SUBRTX (iter, array, x, ALL)
8526 if (GET_CODE (*iter) == CONST_POLY_INT)
8527 return true;
8528
8529 split_const (x, &base, &offset);
8530 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8531 {
8532 if (aarch64_classify_symbol (base, INTVAL (offset))
8533 != SYMBOL_FORCE_TO_MEM)
8534 return true;
8535 else
8536 /* Avoid generating a 64-bit relocation in ILP32; leave
8537 to aarch64_expand_mov_immediate to handle it properly. */
8538 return mode != ptr_mode;
8539 }
8540
8541 return aarch64_tls_referenced_p (x);
8542 }
8543
8544 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8545 The expansion for a table switch is quite expensive due to the number
8546 of instructions, the table lookup and the hard-to-predict indirect jump.
8547 When optimizing for speed at -O3, use the per-core tuning if set,
8548 otherwise use tables for more than 16 cases as a trade-off between size
8549 and performance. When optimizing for size, use the default setting. */
8550
8551 static unsigned int
8552 aarch64_case_values_threshold (void)
8553 {
8554 /* Use the specified limit for the number of cases before using jump
8555 tables at higher optimization levels. */
8556 if (optimize > 2
8557 && selected_cpu->tune->max_case_values != 0)
8558 return selected_cpu->tune->max_case_values;
8559 else
8560 return optimize_size ? default_case_values_threshold () : 17;
8561 }
8562
8563 /* Return true if register REGNO is a valid index register.
8564 STRICT_P is true if REG_OK_STRICT is in effect. */
8565
8566 bool
8567 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8568 {
8569 if (!HARD_REGISTER_NUM_P (regno))
8570 {
8571 if (!strict_p)
8572 return true;
8573
8574 if (!reg_renumber)
8575 return false;
8576
8577 regno = reg_renumber[regno];
8578 }
8579 return GP_REGNUM_P (regno);
8580 }
8581
8582 /* Return true if register REGNO is a valid base register.
8583 STRICT_P is true if REG_OK_STRICT is in effect. */
8584
8585 bool
8586 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8587 {
8588 if (!HARD_REGISTER_NUM_P (regno))
8589 {
8590 if (!strict_p)
8591 return true;
8592
8593 if (!reg_renumber)
8594 return false;
8595
8596 regno = reg_renumber[regno];
8597 }
8598
8599 /* The fake registers will be eliminated to either the stack or
8600 hard frame pointer, both of which are usually valid base registers.
8601 Reload deals with the cases where the eliminated form isn't valid. */
8602 return (GP_REGNUM_P (regno)
8603 || regno == SP_REGNUM
8604 || regno == FRAME_POINTER_REGNUM
8605 || regno == ARG_POINTER_REGNUM);
8606 }
8607
8608 /* Return true if X is a valid base register.
8609 STRICT_P is true if REG_OK_STRICT is in effect. */
8610
8611 static bool
8612 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8613 {
8614 if (!strict_p
8615 && GET_CODE (x) == SUBREG
8616 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8617 x = SUBREG_REG (x);
8618
8619 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8620 }
8621
8622 /* Return true if address offset is a valid index. If it is, fill in INFO
8623 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8624
8625 static bool
8626 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8627 machine_mode mode, bool strict_p)
8628 {
8629 enum aarch64_address_type type;
8630 rtx index;
8631 int shift;
8632
8633 /* (reg:P) */
8634 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8635 && GET_MODE (x) == Pmode)
8636 {
8637 type = ADDRESS_REG_REG;
8638 index = x;
8639 shift = 0;
8640 }
8641 /* (sign_extend:DI (reg:SI)) */
8642 else if ((GET_CODE (x) == SIGN_EXTEND
8643 || GET_CODE (x) == ZERO_EXTEND)
8644 && GET_MODE (x) == DImode
8645 && GET_MODE (XEXP (x, 0)) == SImode)
8646 {
8647 type = (GET_CODE (x) == SIGN_EXTEND)
8648 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8649 index = XEXP (x, 0);
8650 shift = 0;
8651 }
8652 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8653 else if (GET_CODE (x) == MULT
8654 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8655 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8656 && GET_MODE (XEXP (x, 0)) == DImode
8657 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8658 && CONST_INT_P (XEXP (x, 1)))
8659 {
8660 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8661 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8662 index = XEXP (XEXP (x, 0), 0);
8663 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8664 }
8665 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8666 else if (GET_CODE (x) == ASHIFT
8667 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8668 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8669 && GET_MODE (XEXP (x, 0)) == DImode
8670 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8671 && CONST_INT_P (XEXP (x, 1)))
8672 {
8673 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8674 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8675 index = XEXP (XEXP (x, 0), 0);
8676 shift = INTVAL (XEXP (x, 1));
8677 }
8678 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8679 else if ((GET_CODE (x) == SIGN_EXTRACT
8680 || GET_CODE (x) == ZERO_EXTRACT)
8681 && GET_MODE (x) == DImode
8682 && GET_CODE (XEXP (x, 0)) == MULT
8683 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8684 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8685 {
8686 type = (GET_CODE (x) == SIGN_EXTRACT)
8687 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8688 index = XEXP (XEXP (x, 0), 0);
8689 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8690 if (INTVAL (XEXP (x, 1)) != 32 + shift
8691 || INTVAL (XEXP (x, 2)) != 0)
8692 shift = -1;
8693 }
8694 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8695 (const_int 0xffffffff<<shift)) */
8696 else if (GET_CODE (x) == AND
8697 && GET_MODE (x) == DImode
8698 && GET_CODE (XEXP (x, 0)) == MULT
8699 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8700 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8701 && CONST_INT_P (XEXP (x, 1)))
8702 {
8703 type = ADDRESS_REG_UXTW;
8704 index = XEXP (XEXP (x, 0), 0);
8705 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8706 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8707 shift = -1;
8708 }
8709 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8710 else if ((GET_CODE (x) == SIGN_EXTRACT
8711 || GET_CODE (x) == ZERO_EXTRACT)
8712 && GET_MODE (x) == DImode
8713 && GET_CODE (XEXP (x, 0)) == ASHIFT
8714 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8715 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8716 {
8717 type = (GET_CODE (x) == SIGN_EXTRACT)
8718 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8719 index = XEXP (XEXP (x, 0), 0);
8720 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8721 if (INTVAL (XEXP (x, 1)) != 32 + shift
8722 || INTVAL (XEXP (x, 2)) != 0)
8723 shift = -1;
8724 }
8725 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8726 (const_int 0xffffffff<<shift)) */
8727 else if (GET_CODE (x) == AND
8728 && GET_MODE (x) == DImode
8729 && GET_CODE (XEXP (x, 0)) == ASHIFT
8730 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8731 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8732 && CONST_INT_P (XEXP (x, 1)))
8733 {
8734 type = ADDRESS_REG_UXTW;
8735 index = XEXP (XEXP (x, 0), 0);
8736 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8737 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8738 shift = -1;
8739 }
8740 /* (mult:P (reg:P) (const_int scale)) */
8741 else if (GET_CODE (x) == MULT
8742 && GET_MODE (x) == Pmode
8743 && GET_MODE (XEXP (x, 0)) == Pmode
8744 && CONST_INT_P (XEXP (x, 1)))
8745 {
8746 type = ADDRESS_REG_REG;
8747 index = XEXP (x, 0);
8748 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8749 }
8750 /* (ashift:P (reg:P) (const_int shift)) */
8751 else if (GET_CODE (x) == ASHIFT
8752 && GET_MODE (x) == Pmode
8753 && GET_MODE (XEXP (x, 0)) == Pmode
8754 && CONST_INT_P (XEXP (x, 1)))
8755 {
8756 type = ADDRESS_REG_REG;
8757 index = XEXP (x, 0);
8758 shift = INTVAL (XEXP (x, 1));
8759 }
8760 else
8761 return false;
8762
8763 if (!strict_p
8764 && GET_CODE (index) == SUBREG
8765 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8766 index = SUBREG_REG (index);
8767
8768 if (aarch64_sve_data_mode_p (mode))
8769 {
8770 if (type != ADDRESS_REG_REG
8771 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8772 return false;
8773 }
8774 else
8775 {
8776 if (shift != 0
8777 && !(IN_RANGE (shift, 1, 3)
8778 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8779 return false;
8780 }
8781
8782 if (REG_P (index)
8783 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8784 {
8785 info->type = type;
8786 info->offset = index;
8787 info->shift = shift;
8788 return true;
8789 }
8790
8791 return false;
8792 }
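/* Illustrative examples of the forms recognised above (Pmode == DImode):
   (ashift:DI (reg:DI x1) (const_int 3)) used with a DImode access gives
   ADDRESS_REG_REG with shift 3, i.e. [Xn, X1, LSL #3];
   (sign_extend:DI (reg:SI w1)) gives ADDRESS_REG_SXTW with shift 0,
   i.e. [Xn, W1, SXTW].  */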
8793
8794 /* Return true if MODE is one of the modes for which we
8795 support LDP/STP operations. */
8796
8797 static bool
8798 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8799 {
8800 return mode == SImode || mode == DImode
8801 || mode == SFmode || mode == DFmode
8802 || (aarch64_vector_mode_supported_p (mode)
8803 && (known_eq (GET_MODE_SIZE (mode), 8)
8804 || (known_eq (GET_MODE_SIZE (mode), 16)
8805 && (aarch64_tune_params.extra_tuning_flags
8806 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8807 }
8808
8809 /* Return true if REGNO is a virtual pointer register, or an eliminable
8810 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8811 include stack_pointer or hard_frame_pointer. */
8812 static bool
8813 virt_or_elim_regno_p (unsigned regno)
8814 {
8815 return ((regno >= FIRST_VIRTUAL_REGISTER
8816 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8817 || regno == FRAME_POINTER_REGNUM
8818 || regno == ARG_POINTER_REGNUM);
8819 }
8820
8821 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8822 If it is, fill in INFO appropriately. STRICT_P is true if
8823 REG_OK_STRICT is in effect. */
8824
8825 bool
8826 aarch64_classify_address (struct aarch64_address_info *info,
8827 rtx x, machine_mode mode, bool strict_p,
8828 aarch64_addr_query_type type)
8829 {
8830 enum rtx_code code = GET_CODE (x);
8831 rtx op0, op1;
8832 poly_int64 offset;
8833
8834 HOST_WIDE_INT const_size;
8835
8836 /* Whether a vector mode is partial doesn't affect address legitimacy.
8837 Partial vectors like VNx8QImode allow the same indexed addressing
8838 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8839 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8840 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8841 vec_flags &= ~VEC_PARTIAL;
8842
8843 /* On BE, we use load/store pair for all large int mode load/stores.
8844 TI/TFmode may also use a load/store pair. */
8845 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8846 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8847 || type == ADDR_QUERY_LDP_STP_N
8848 || mode == TImode
8849 || mode == TFmode
8850 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8851
8852 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8853 corresponds to the actual size of the memory being loaded/stored and
8854 the mode used for the addressing-mode check is half of that. */
8855 if (type == ADDR_QUERY_LDP_STP_N
8856 && known_eq (GET_MODE_SIZE (mode), 16))
8857 mode = DFmode;
8858
8859 bool allow_reg_index_p = (!load_store_pair_p
8860 && (known_lt (GET_MODE_SIZE (mode), 16)
8861 || vec_flags == VEC_ADVSIMD
8862 || vec_flags & VEC_SVE_DATA));
8863
8864 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8865 [Rn, #offset, MUL VL]. */
8866 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8867 && (code != REG && code != PLUS))
8868 return false;
8869
8870 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8871 REG addressing. */
8872 if (advsimd_struct_p
8873 && !BYTES_BIG_ENDIAN
8874 && (code != POST_INC && code != REG))
8875 return false;
8876
8877 gcc_checking_assert (GET_MODE (x) == VOIDmode
8878 || SCALAR_INT_MODE_P (GET_MODE (x)));
8879
8880 switch (code)
8881 {
8882 case REG:
8883 case SUBREG:
8884 info->type = ADDRESS_REG_IMM;
8885 info->base = x;
8886 info->offset = const0_rtx;
8887 info->const_offset = 0;
8888 return aarch64_base_register_rtx_p (x, strict_p);
8889
8890 case PLUS:
8891 op0 = XEXP (x, 0);
8892 op1 = XEXP (x, 1);
8893
8894 if (! strict_p
8895 && REG_P (op0)
8896 && virt_or_elim_regno_p (REGNO (op0))
8897 && poly_int_rtx_p (op1, &offset))
8898 {
8899 info->type = ADDRESS_REG_IMM;
8900 info->base = op0;
8901 info->offset = op1;
8902 info->const_offset = offset;
8903
8904 return true;
8905 }
8906
8907 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8908 && aarch64_base_register_rtx_p (op0, strict_p)
8909 && poly_int_rtx_p (op1, &offset))
8910 {
8911 info->type = ADDRESS_REG_IMM;
8912 info->base = op0;
8913 info->offset = op1;
8914 info->const_offset = offset;
8915
8916 /* TImode and TFmode values are allowed in both pairs of X
8917 registers and individual Q registers. The available
8918 address modes are:
8919 X,X: 7-bit signed scaled offset
8920 Q: 9-bit signed offset
8921 We conservatively require an offset representable in either mode.
8922 When performing the check for pairs of X registers i.e. LDP/STP
8923 pass down DImode since that is the natural size of the LDP/STP
8924 instruction memory accesses. */
8925 if (mode == TImode || mode == TFmode)
8926 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8927 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8928 || offset_12bit_unsigned_scaled_p (mode, offset)));
8929
8930 /* A 7-bit offset check because OImode will emit an ldp/stp
8931 instruction (only big endian will get here).
8932 For ldp/stp instructions, the offset is scaled for the size of a
8933 single element of the pair. */
8934 if (mode == OImode)
8935 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8936
8937 /* Three 9/12-bit offset checks because CImode will emit three
8938 ldr/str instructions (only big endian will get here). */
8939 if (mode == CImode)
8940 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8941 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8942 offset + 32)
8943 || offset_12bit_unsigned_scaled_p (V16QImode,
8944 offset + 32)));
8945
8946 /* Two 7-bit offset checks because XImode will emit two ldp/stp
8947 instructions (only big endian will get here). */
8948 if (mode == XImode)
8949 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8950 && aarch64_offset_7bit_signed_scaled_p (TImode,
8951 offset + 32));
8952
8953 /* Make "m" use the LD1 offset range for SVE data modes, so
8954 that pre-RTL optimizers like ivopts will work to that
8955 instead of the wider LDR/STR range. */
8956 if (vec_flags == VEC_SVE_DATA)
8957 return (type == ADDR_QUERY_M
8958 ? offset_4bit_signed_scaled_p (mode, offset)
8959 : offset_9bit_signed_scaled_p (mode, offset));
8960
8961 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8962 {
8963 poly_int64 end_offset = (offset
8964 + GET_MODE_SIZE (mode)
8965 - BYTES_PER_SVE_VECTOR);
8966 return (type == ADDR_QUERY_M
8967 ? offset_4bit_signed_scaled_p (mode, offset)
8968 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8969 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8970 end_offset)));
8971 }
8972
8973 if (vec_flags == VEC_SVE_PRED)
8974 return offset_9bit_signed_scaled_p (mode, offset);
8975
8976 if (load_store_pair_p)
8977 return ((known_eq (GET_MODE_SIZE (mode), 4)
8978 || known_eq (GET_MODE_SIZE (mode), 8)
8979 || known_eq (GET_MODE_SIZE (mode), 16))
8980 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8981 else
8982 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8983 || offset_12bit_unsigned_scaled_p (mode, offset));
8984 }
8985
8986 if (allow_reg_index_p)
8987 {
8988 /* Look for base + (scaled/extended) index register. */
8989 if (aarch64_base_register_rtx_p (op0, strict_p)
8990 && aarch64_classify_index (info, op1, mode, strict_p))
8991 {
8992 info->base = op0;
8993 return true;
8994 }
8995 if (aarch64_base_register_rtx_p (op1, strict_p)
8996 && aarch64_classify_index (info, op0, mode, strict_p))
8997 {
8998 info->base = op1;
8999 return true;
9000 }
9001 }
9002
9003 return false;
9004
9005 case POST_INC:
9006 case POST_DEC:
9007 case PRE_INC:
9008 case PRE_DEC:
9009 info->type = ADDRESS_REG_WB;
9010 info->base = XEXP (x, 0);
9011 info->offset = NULL_RTX;
9012 return aarch64_base_register_rtx_p (info->base, strict_p);
9013
9014 case POST_MODIFY:
9015 case PRE_MODIFY:
9016 info->type = ADDRESS_REG_WB;
9017 info->base = XEXP (x, 0);
9018 if (GET_CODE (XEXP (x, 1)) == PLUS
9019 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9020 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9021 && aarch64_base_register_rtx_p (info->base, strict_p))
9022 {
9023 info->offset = XEXP (XEXP (x, 1), 1);
9024 info->const_offset = offset;
9025
9026 /* TImode and TFmode values are allowed in both pairs of X
9027 registers and individual Q registers. The available
9028 address modes are:
9029 X,X: 7-bit signed scaled offset
9030 Q: 9-bit signed offset
9031 We conservatively require an offset representable in either mode.
9032 */
9033 if (mode == TImode || mode == TFmode)
9034 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9035 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9036
9037 if (load_store_pair_p)
9038 return ((known_eq (GET_MODE_SIZE (mode), 4)
9039 || known_eq (GET_MODE_SIZE (mode), 8)
9040 || known_eq (GET_MODE_SIZE (mode), 16))
9041 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9042 else
9043 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9044 }
9045 return false;
9046
9047 case CONST:
9048 case SYMBOL_REF:
9049 case LABEL_REF:
9050 /* load literal: pc-relative constant pool entry. Only supported
9051 for SI mode or larger. */
9052 info->type = ADDRESS_SYMBOLIC;
9053
9054 if (!load_store_pair_p
9055 && GET_MODE_SIZE (mode).is_constant (&const_size)
9056 && const_size >= 4)
9057 {
9058 rtx sym, addend;
9059
9060 split_const (x, &sym, &addend);
9061 return ((GET_CODE (sym) == LABEL_REF
9062 || (GET_CODE (sym) == SYMBOL_REF
9063 && CONSTANT_POOL_ADDRESS_P (sym)
9064 && aarch64_pcrelative_literal_loads)));
9065 }
9066 return false;
9067
9068 case LO_SUM:
9069 info->type = ADDRESS_LO_SUM;
9070 info->base = XEXP (x, 0);
9071 info->offset = XEXP (x, 1);
9072 if (allow_reg_index_p
9073 && aarch64_base_register_rtx_p (info->base, strict_p))
9074 {
9075 rtx sym, offs;
9076 split_const (info->offset, &sym, &offs);
9077 if (GET_CODE (sym) == SYMBOL_REF
9078 && (aarch64_classify_symbol (sym, INTVAL (offs))
9079 == SYMBOL_SMALL_ABSOLUTE))
9080 {
9081 /* The symbol and offset must be aligned to the access size. */
9082 unsigned int align;
9083
9084 if (CONSTANT_POOL_ADDRESS_P (sym))
9085 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9086 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9087 {
9088 tree exp = SYMBOL_REF_DECL (sym);
9089 align = TYPE_ALIGN (TREE_TYPE (exp));
9090 align = aarch64_constant_alignment (exp, align);
9091 }
9092 else if (SYMBOL_REF_DECL (sym))
9093 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9094 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9095 && SYMBOL_REF_BLOCK (sym) != NULL)
9096 align = SYMBOL_REF_BLOCK (sym)->alignment;
9097 else
9098 align = BITS_PER_UNIT;
9099
9100 poly_int64 ref_size = GET_MODE_SIZE (mode);
9101 if (known_eq (ref_size, 0))
9102 ref_size = GET_MODE_SIZE (DImode);
9103
9104 return (multiple_p (INTVAL (offs), ref_size)
9105 && multiple_p (align / BITS_PER_UNIT, ref_size));
9106 }
9107 }
9108 return false;
9109
9110 default:
9111 return false;
9112 }
9113 }
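/* Illustrative examples (DImode access, Pmode == DImode):
   (plus:DI (reg:DI x0) (const_int 16)) classifies as ADDRESS_REG_IMM
   with const_offset 16 (a valid unsigned scaled offset), while
   (post_inc:DI (reg:DI x0)) classifies as ADDRESS_REG_WB.  */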
9114
9115 /* Return true if the address X is valid for a PRFM instruction.
9116 STRICT_P is true if we should do strict checking with
9117 aarch64_classify_address. */
9118
9119 bool
9120 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9121 {
9122 struct aarch64_address_info addr;
9123
9124 /* PRFM accepts the same addresses as DImode... */
9125 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9126 if (!res)
9127 return false;
9128
9129 /* ... except writeback forms. */
9130 return addr.type != ADDRESS_REG_WB;
9131 }
9132
9133 bool
9134 aarch64_symbolic_address_p (rtx x)
9135 {
9136 rtx offset;
9137
9138 split_const (x, &x, &offset);
9139 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9140 }
9141
9142 /* Classify the base of symbolic expression X. */
9143
9144 enum aarch64_symbol_type
9145 aarch64_classify_symbolic_expression (rtx x)
9146 {
9147 rtx offset;
9148
9149 split_const (x, &x, &offset);
9150 return aarch64_classify_symbol (x, INTVAL (offset));
9151 }
9152
9153
9154 /* Return TRUE if X is a legitimate address for accessing memory in
9155 mode MODE. */
9156 static bool
9157 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9158 {
9159 struct aarch64_address_info addr;
9160
9161 return aarch64_classify_address (&addr, x, mode, strict_p);
9162 }
9163
9164 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9165 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
9166 bool
9167 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9168 aarch64_addr_query_type type)
9169 {
9170 struct aarch64_address_info addr;
9171
9172 return aarch64_classify_address (&addr, x, mode, strict_p, type);
9173 }
9174
9175 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9176
9177 static bool
9178 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9179 poly_int64 orig_offset,
9180 machine_mode mode)
9181 {
9182 HOST_WIDE_INT size;
9183 if (GET_MODE_SIZE (mode).is_constant (&size))
9184 {
9185 HOST_WIDE_INT const_offset, second_offset;
9186
9187 /* A general SVE offset is A * VQ + B. Remove the A component from
9188 coefficient 0 in order to get the constant B. */
9189 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9190
9191 /* Split an out-of-range address displacement into a base and
9192 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9193 range otherwise to increase opportunities for sharing the base
9194 address of different sizes. Unaligned accesses use the signed
9195 9-bit range, TImode/TFmode use the intersection of signed
9196 scaled 7-bit and signed 9-bit offset. */
9197 if (mode == TImode || mode == TFmode)
9198 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9199 else if ((const_offset & (size - 1)) != 0)
9200 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9201 else
9202 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9203
9204 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9205 return false;
9206
9207 /* Split the offset into second_offset and the rest. */
9208 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9209 *offset2 = gen_int_mode (second_offset, Pmode);
9210 return true;
9211 }
9212 else
9213 {
9214 /* Get the mode we should use as the basis of the range. For structure
9215 modes this is the mode of one vector. */
9216 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9217 machine_mode step_mode
9218 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9219
9220 /* Get the "mul vl" multiplier we'd like to use. */
9221 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9222 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9223 if (vec_flags & VEC_SVE_DATA)
9224 /* LDR supports a 9-bit range, but the move patterns for
9225 structure modes require all vectors to be in range of the
9226 same base. The simplest way of accommodating that while still
9227 promoting reuse of anchor points between different modes is
9228 to use an 8-bit range unconditionally. */
9229 vnum = ((vnum + 128) & 255) - 128;
9230 else
9231 /* Predicates are only handled singly, so we might as well use
9232 the full range. */
9233 vnum = ((vnum + 256) & 511) - 256;
9234 if (vnum == 0)
9235 return false;
9236
9237 /* Convert the "mul vl" multiplier into a byte offset. */
9238 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9239 if (known_eq (second_offset, orig_offset))
9240 return false;
9241
9242 /* Split the offset into second_offset and the rest. */
9243 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9244 *offset2 = gen_int_mode (second_offset, Pmode);
9245 return true;
9246 }
9247 }
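/* Worked example (illustrative): for an SImode access with
   ORIG_OFFSET = 0x10010, the offset is aligned, so
   second_offset = 0x10010 & 0x3ffc = 0x10.  The split is then
   *OFFSET1 = 0x10000 (added to the base and shareable between accesses)
   and *OFFSET2 = 0x10 (a legal scaled immediate offset).  */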
9248
9249 /* Return the binary representation of floating point constant VALUE in INTVAL.
9250 If the value cannot be converted, return false without setting INTVAL.
9251 The conversion is done in the mode of VALUE. */
9252 bool
9253 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9254 {
9255
9256 /* We make a general exception for 0. */
9257 if (aarch64_float_const_zero_rtx_p (value))
9258 {
9259 *intval = 0;
9260 return true;
9261 }
9262
9263 scalar_float_mode mode;
9264 if (GET_CODE (value) != CONST_DOUBLE
9265 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9266 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9267 /* Only support up to DF mode. */
9268 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9269 return false;
9270
9271 unsigned HOST_WIDE_INT ival = 0;
9272
9273 long res[2];
9274 real_to_target (res,
9275 CONST_DOUBLE_REAL_VALUE (value),
9276 REAL_MODE_FORMAT (mode));
9277
9278 if (mode == DFmode)
9279 {
9280 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9281 ival = zext_hwi (res[order], 32);
9282 ival |= (zext_hwi (res[1 - order], 32) << 32);
9283 }
9284 else
9285 ival = zext_hwi (res[0], 32);
9286
9287 *intval = ival;
9288 return true;
9289 }
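/* Illustrative example: for the DFmode constant 1.0 this stores the IEEE
   bit pattern 0x3ff0000000000000 in *INTVAL; for the SFmode constant 1.0
   it stores 0x3f800000.  */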
9290
9291 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9292 single MOV(+MOVK) followed by an FMOV. */
9293 bool
9294 aarch64_float_const_rtx_p (rtx x)
9295 {
9296 machine_mode mode = GET_MODE (x);
9297 if (mode == VOIDmode)
9298 return false;
9299
9300 /* Determine whether it's cheaper to write float constants as
9301 mov/movk pairs rather than ldr/adrp pairs. */
9302 unsigned HOST_WIDE_INT ival;
9303
9304 if (GET_CODE (x) == CONST_DOUBLE
9305 && SCALAR_FLOAT_MODE_P (mode)
9306 && aarch64_reinterpret_float_as_int (x, &ival))
9307 {
9308 scalar_int_mode imode = (mode == HFmode
9309 ? SImode
9310 : int_mode_for_mode (mode).require ());
9311 int num_instr = aarch64_internal_mov_immediate
9312 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9313 return num_instr < 3;
9314 }
9315
9316 return false;
9317 }
9318
9319 /* Return TRUE if rtx X is the immediate constant 0.0. */
9320 bool
9321 aarch64_float_const_zero_rtx_p (rtx x)
9322 {
9323 if (GET_MODE (x) == VOIDmode)
9324 return false;
9325
9326 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9327 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9328 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9329 }
9330
9331 /* Return TRUE if rtx X is an immediate constant that fits in a single
9332 MOVI immediate operation. */
9333 bool
9334 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9335 {
9336 if (!TARGET_SIMD)
9337 return false;
9338
9339 machine_mode vmode;
9340 scalar_int_mode imode;
9341 unsigned HOST_WIDE_INT ival;
9342
9343 if (GET_CODE (x) == CONST_DOUBLE
9344 && SCALAR_FLOAT_MODE_P (mode))
9345 {
9346 if (!aarch64_reinterpret_float_as_int (x, &ival))
9347 return false;
9348
9349 /* We make a general exception for 0. */
9350 if (aarch64_float_const_zero_rtx_p (x))
9351 return true;
9352
9353 imode = int_mode_for_mode (mode).require ();
9354 }
9355 else if (GET_CODE (x) == CONST_INT
9356 && is_a <scalar_int_mode> (mode, &imode))
9357 ival = INTVAL (x);
9358 else
9359 return false;
9360
9361 /* Use a 64-bit mode for everything except DI/DF mode, where we use
9362 a 128-bit vector mode. */
9363 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9364
9365 vmode = aarch64_simd_container_mode (imode, width);
9366 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9367
9368 return aarch64_simd_valid_immediate (v_op, NULL);
9369 }
9370
9371
9372 /* Return the fixed registers used for condition codes. */
9373
9374 static bool
9375 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9376 {
9377 *p1 = CC_REGNUM;
9378 *p2 = INVALID_REGNUM;
9379 return true;
9380 }
9381
9382 /* This function is used by the call expanders of the machine description.
9383 RESULT is the register in which the result is returned. It's NULL for
9384 "call" and "sibcall".
9385 MEM is the location of the function call.
9386 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9387 SIBCALL indicates whether this function call is a normal call or a sibling
9388 call; a different pattern is generated accordingly. */
9389
9390 void
9391 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9392 {
9393 rtx call, callee, tmp;
9394 rtvec vec;
9395 machine_mode mode;
9396
9397 gcc_assert (MEM_P (mem));
9398 callee = XEXP (mem, 0);
9399 mode = GET_MODE (callee);
9400 gcc_assert (mode == Pmode);
9401
9402 /* Decide if we should generate indirect calls by loading the
9403 address of the callee into a register before performing
9404 the branch-and-link. */
9405 if (SYMBOL_REF_P (callee)
9406 ? (aarch64_is_long_call_p (callee)
9407 || aarch64_is_noplt_call_p (callee))
9408 : !REG_P (callee))
9409 XEXP (mem, 0) = force_reg (mode, callee);
9410
9411 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9412
9413 if (result != NULL_RTX)
9414 call = gen_rtx_SET (result, call);
9415
9416 if (sibcall)
9417 tmp = ret_rtx;
9418 else
9419 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9420
9421 gcc_assert (CONST_INT_P (callee_abi));
9422 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9423 UNSPEC_CALLEE_ABI);
9424
9425 vec = gen_rtvec (3, call, callee_abi, tmp);
9426 call = gen_rtx_PARALLEL (VOIDmode, vec);
9427
9428 aarch64_emit_call_insn (call);
9429 }
9430
9431 /* Emit call insn with PAT and do aarch64-specific handling. */
9432
9433 void
9434 aarch64_emit_call_insn (rtx pat)
9435 {
9436 rtx insn = emit_call_insn (pat);
9437
9438 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9439 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9440 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9441 }
9442
9443 machine_mode
9444 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9445 {
9446 machine_mode mode_x = GET_MODE (x);
9447 rtx_code code_x = GET_CODE (x);
9448
9449 /* All floating point compares return CCFP if it is an equality
9450 comparison, and CCFPE otherwise. */
9451 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9452 {
9453 switch (code)
9454 {
9455 case EQ:
9456 case NE:
9457 case UNORDERED:
9458 case ORDERED:
9459 case UNLT:
9460 case UNLE:
9461 case UNGT:
9462 case UNGE:
9463 case UNEQ:
9464 return CCFPmode;
9465
9466 case LT:
9467 case LE:
9468 case GT:
9469 case GE:
9470 case LTGT:
9471 return CCFPEmode;
9472
9473 default:
9474 gcc_unreachable ();
9475 }
9476 }
9477
9478 /* Equality comparisons of short modes against zero can be performed
9479 using the TST instruction with the appropriate bitmask. */
9480 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9481 && (code == EQ || code == NE)
9482 && (mode_x == HImode || mode_x == QImode))
9483 return CC_NZmode;
9484
9485 /* Similarly, comparisons of zero_extends from shorter modes can
9486 be performed using an ANDS with an immediate mask. */
9487 if (y == const0_rtx && code_x == ZERO_EXTEND
9488 && (mode_x == SImode || mode_x == DImode)
9489 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9490 && (code == EQ || code == NE))
9491 return CC_NZmode;
9492
9493 if ((mode_x == SImode || mode_x == DImode)
9494 && y == const0_rtx
9495 && (code == EQ || code == NE || code == LT || code == GE)
9496 && (code_x == PLUS || code_x == MINUS || code_x == AND
9497 || code_x == NEG
9498 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9499 && CONST_INT_P (XEXP (x, 2)))))
9500 return CC_NZmode;
9501
9502 /* A compare with a shifted operand. Because of canonicalization,
9503 the comparison will have to be swapped when we emit the assembly
9504 code. */
9505 if ((mode_x == SImode || mode_x == DImode)
9506 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9507 && (code_x == ASHIFT || code_x == ASHIFTRT
9508 || code_x == LSHIFTRT
9509 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9510 return CC_SWPmode;
9511
9512 /* Similarly for a negated operand, but we can only do this for
9513 equalities. */
9514 if ((mode_x == SImode || mode_x == DImode)
9515 && (REG_P (y) || GET_CODE (y) == SUBREG)
9516 && (code == EQ || code == NE)
9517 && code_x == NEG)
9518 return CC_Zmode;
9519
9520 /* A test for unsigned overflow from an addition. */
9521 if ((mode_x == DImode || mode_x == TImode)
9522 && (code == LTU || code == GEU)
9523 && code_x == PLUS
9524 && rtx_equal_p (XEXP (x, 0), y))
9525 return CC_Cmode;
9526
9527 /* A test for unsigned overflow from an add with carry. */
9528 if ((mode_x == DImode || mode_x == TImode)
9529 && (code == LTU || code == GEU)
9530 && code_x == PLUS
9531 && CONST_SCALAR_INT_P (y)
9532 && (rtx_mode_t (y, mode_x)
9533 == (wi::shwi (1, mode_x)
9534 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9535 return CC_ADCmode;
9536
9537 /* A test for signed overflow. */
9538 if ((mode_x == DImode || mode_x == TImode)
9539 && code == NE
9540 && code_x == PLUS
9541 && GET_CODE (y) == SIGN_EXTEND)
9542 return CC_Vmode;
9543
9544 /* For everything else, return CCmode. */
9545 return CCmode;
9546 }
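/* Illustrative examples: comparing (plus:DI x y) against zero for EQ
   selects CC_NZmode (the flags come from an ADDS); comparing a shifted
   operand such as (ashift:DI x n) against a register selects CC_SWPmode;
   an ordered DFmode comparison such as LT selects CCFPEmode, while an
   unordered one such as UNLT selects CCFPmode.  */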
9547
9548 static int
9549 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9550
9551 int
9552 aarch64_get_condition_code (rtx x)
9553 {
9554 machine_mode mode = GET_MODE (XEXP (x, 0));
9555 enum rtx_code comp_code = GET_CODE (x);
9556
9557 if (GET_MODE_CLASS (mode) != MODE_CC)
9558 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9559 return aarch64_get_condition_code_1 (mode, comp_code);
9560 }
9561
9562 static int
9563 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9564 {
9565 switch (mode)
9566 {
9567 case E_CCFPmode:
9568 case E_CCFPEmode:
9569 switch (comp_code)
9570 {
9571 case GE: return AARCH64_GE;
9572 case GT: return AARCH64_GT;
9573 case LE: return AARCH64_LS;
9574 case LT: return AARCH64_MI;
9575 case NE: return AARCH64_NE;
9576 case EQ: return AARCH64_EQ;
9577 case ORDERED: return AARCH64_VC;
9578 case UNORDERED: return AARCH64_VS;
9579 case UNLT: return AARCH64_LT;
9580 case UNLE: return AARCH64_LE;
9581 case UNGT: return AARCH64_HI;
9582 case UNGE: return AARCH64_PL;
9583 default: return -1;
9584 }
9585 break;
9586
9587 case E_CCmode:
9588 switch (comp_code)
9589 {
9590 case NE: return AARCH64_NE;
9591 case EQ: return AARCH64_EQ;
9592 case GE: return AARCH64_GE;
9593 case GT: return AARCH64_GT;
9594 case LE: return AARCH64_LE;
9595 case LT: return AARCH64_LT;
9596 case GEU: return AARCH64_CS;
9597 case GTU: return AARCH64_HI;
9598 case LEU: return AARCH64_LS;
9599 case LTU: return AARCH64_CC;
9600 default: return -1;
9601 }
9602 break;
9603
9604 case E_CC_SWPmode:
9605 switch (comp_code)
9606 {
9607 case NE: return AARCH64_NE;
9608 case EQ: return AARCH64_EQ;
9609 case GE: return AARCH64_LE;
9610 case GT: return AARCH64_LT;
9611 case LE: return AARCH64_GE;
9612 case LT: return AARCH64_GT;
9613 case GEU: return AARCH64_LS;
9614 case GTU: return AARCH64_CC;
9615 case LEU: return AARCH64_CS;
9616 case LTU: return AARCH64_HI;
9617 default: return -1;
9618 }
9619 break;
9620
9621 case E_CC_NZCmode:
9622 switch (comp_code)
9623 {
9624 case NE: return AARCH64_NE; /* = any */
9625 case EQ: return AARCH64_EQ; /* = none */
9626 case GE: return AARCH64_PL; /* = nfrst */
9627 case LT: return AARCH64_MI; /* = first */
9628 case GEU: return AARCH64_CS; /* = nlast */
9629 case GTU: return AARCH64_HI; /* = pmore */
9630 case LEU: return AARCH64_LS; /* = plast */
9631 case LTU: return AARCH64_CC; /* = last */
9632 default: return -1;
9633 }
9634 break;
9635
9636 case E_CC_NZmode:
9637 switch (comp_code)
9638 {
9639 case NE: return AARCH64_NE;
9640 case EQ: return AARCH64_EQ;
9641 case GE: return AARCH64_PL;
9642 case LT: return AARCH64_MI;
9643 default: return -1;
9644 }
9645 break;
9646
9647 case E_CC_Zmode:
9648 switch (comp_code)
9649 {
9650 case NE: return AARCH64_NE;
9651 case EQ: return AARCH64_EQ;
9652 default: return -1;
9653 }
9654 break;
9655
9656 case E_CC_Cmode:
9657 switch (comp_code)
9658 {
9659 case LTU: return AARCH64_CS;
9660 case GEU: return AARCH64_CC;
9661 default: return -1;
9662 }
9663 break;
9664
9665 case E_CC_ADCmode:
9666 switch (comp_code)
9667 {
9668 case GEU: return AARCH64_CS;
9669 case LTU: return AARCH64_CC;
9670 default: return -1;
9671 }
9672 break;
9673
9674 case E_CC_Vmode:
9675 switch (comp_code)
9676 {
9677 case NE: return AARCH64_VS;
9678 case EQ: return AARCH64_VC;
9679 default: return -1;
9680 }
9681 break;
9682
9683 default:
9684 return -1;
9685 }
9686
9687 return -1;
9688 }
9689
9690 bool
9691 aarch64_const_vec_all_same_in_range_p (rtx x,
9692 HOST_WIDE_INT minval,
9693 HOST_WIDE_INT maxval)
9694 {
9695 rtx elt;
9696 return (const_vec_duplicate_p (x, &elt)
9697 && CONST_INT_P (elt)
9698 && IN_RANGE (INTVAL (elt), minval, maxval));
9699 }
9700
9701 bool
9702 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9703 {
9704 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9705 }
9706
9707 /* Return true if VEC is a constant in which every element is in the range
9708 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9709
9710 static bool
9711 aarch64_const_vec_all_in_range_p (rtx vec,
9712 HOST_WIDE_INT minval,
9713 HOST_WIDE_INT maxval)
9714 {
9715 if (GET_CODE (vec) != CONST_VECTOR
9716 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9717 return false;
9718
9719 int nunits;
9720 if (!CONST_VECTOR_STEPPED_P (vec))
9721 nunits = const_vector_encoded_nelts (vec);
9722 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9723 return false;
9724
9725 for (int i = 0; i < nunits; i++)
9726 {
9727 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9728 if (!CONST_INT_P (vec_elem)
9729 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9730 return false;
9731 }
9732 return true;
9733 }
9734
9735 /* N Z C V. */
9736 #define AARCH64_CC_V 1
9737 #define AARCH64_CC_C (1 << 1)
9738 #define AARCH64_CC_Z (1 << 2)
9739 #define AARCH64_CC_N (1 << 3)
9740
9741 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9742 static const int aarch64_nzcv_codes[] =
9743 {
9744 0, /* EQ, Z == 1. */
9745 AARCH64_CC_Z, /* NE, Z == 0. */
9746 0, /* CS, C == 1. */
9747 AARCH64_CC_C, /* CC, C == 0. */
9748 0, /* MI, N == 1. */
9749 AARCH64_CC_N, /* PL, N == 0. */
9750 0, /* VS, V == 1. */
9751 AARCH64_CC_V, /* VC, V == 0. */
9752 0, /* HI, C == 1 && Z == 0. */
9753 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9754 AARCH64_CC_V, /* GE, N == V. */
9755 0, /* LT, N != V. */
9756 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9757 0, /* LE, !(Z == 0 && N == V). */
9758 0, /* AL, Any. */
9759 0 /* NV, Any. */
9760 };
9761
9762 /* Print floating-point vector immediate operand X to F, negating it
9763 first if NEGATE is true. Return true on success, false if it isn't
9764 a constant we can handle. */
9765
9766 static bool
9767 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9768 {
9769 rtx elt;
9770
9771 if (!const_vec_duplicate_p (x, &elt))
9772 return false;
9773
9774 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9775 if (negate)
9776 r = real_value_negate (&r);
9777
9778 /* Handle the SVE single-bit immediates specially, since they have a
9779 fixed form in the assembly syntax. */
9780 if (real_equal (&r, &dconst0))
9781 asm_fprintf (f, "0.0");
9782 else if (real_equal (&r, &dconst2))
9783 asm_fprintf (f, "2.0");
9784 else if (real_equal (&r, &dconst1))
9785 asm_fprintf (f, "1.0");
9786 else if (real_equal (&r, &dconsthalf))
9787 asm_fprintf (f, "0.5");
9788 else
9789 {
9790 const int buf_size = 20;
9791 char float_buf[buf_size] = {'\0'};
9792 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9793 1, GET_MODE (elt));
9794 asm_fprintf (f, "%s", float_buf);
9795 }
9796
9797 return true;
9798 }
9799
9800 /* Return the equivalent letter for size. */
9801 static char
9802 sizetochar (int size)
9803 {
9804 switch (size)
9805 {
9806 case 64: return 'd';
9807 case 32: return 's';
9808 case 16: return 'h';
9809 case 8 : return 'b';
9810 default: gcc_unreachable ();
9811 }
9812 }
9813
9814 /* Print operand X to file F in a target specific manner according to CODE.
9815 The acceptable formatting commands given by CODE are:
9816 'c': An integer or symbol address without a preceding #
9817 sign.
9818 'C': Take the duplicated element in a vector constant
9819 and print it in hex.
9820 'D': Take the duplicated element in a vector constant
9821 and print it as an unsigned integer, in decimal.
9822 'e': Print the sign/zero-extend size as a character 8->b,
9823 16->h, 32->w. Can also be used for masks:
9824 0xff->b, 0xffff->h, 0xffffffff->w.
9825 'I': If the operand is a duplicated vector constant,
9826 replace it with the duplicated scalar. If the
9827 operand is then a floating-point constant, replace
9828 it with the integer bit representation. Print the
9829 transformed constant as a signed decimal number.
9830 'p': Prints N such that 2^N == X (X must be a power of 2 and
9831 a const_int).
9832 'P': Print the number of non-zero bits in X (a const_int).
9833 'H': Print the higher numbered register of a pair (TImode)
9834 of regs.
9835 'm': Print a condition (eq, ne, etc).
9836 'M': Same as 'm', but invert condition.
9837 'N': Take the duplicated element in a vector constant
9838 and print the negative of it in decimal.
9839 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9840 'S/T/U/V': Print a FP/SIMD register name for a register list.
9841 The register printed is the FP/SIMD register name
9842 of X + 0/1/2/3 for S/T/U/V.
9843 'R': Print a scalar Integer/FP/SIMD register name + 1.
9844 'X': Print bottom 16 bits of integer constant in hex.
9845 'w/x': Print a general register name or the zero register
9846 (32-bit or 64-bit).
9847 '0': Print a normal operand, if it's a general register,
9848 then we assume DImode.
9849 'k': Print NZCV for conditional compare instructions.
9850 'A': Output address constant representing the first
9851 argument of X, specifying a relocation offset
9852 if appropriate.
9853 'L': Output constant address specified by X
9854 with a relocation offset if appropriate.
9855 'G': Prints address of X, specifying a PC relative
9856 relocation mode if appropriate.
9857 'y': Output address of LDP or STP - this is used for
9858 some LDP/STPs which don't use a PARALLEL in their
9859 pattern (so the mode needs to be adjusted).
9860 'z': Output address of a typical LDP or STP. */
9861
9862 static void
9863 aarch64_print_operand (FILE *f, rtx x, int code)
9864 {
9865 rtx elt;
9866 switch (code)
9867 {
9868 case 'c':
9869 switch (GET_CODE (x))
9870 {
9871 case CONST_INT:
9872 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9873 break;
9874
9875 case SYMBOL_REF:
9876 output_addr_const (f, x);
9877 break;
9878
9879 case CONST:
9880 if (GET_CODE (XEXP (x, 0)) == PLUS
9881 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9882 {
9883 output_addr_const (f, x);
9884 break;
9885 }
9886 /* Fall through. */
9887
9888 default:
9889 output_operand_lossage ("unsupported operand for code '%c'", code);
9890 }
9891 break;
9892
9893 case 'e':
9894 {
9895 x = unwrap_const_vec_duplicate (x);
9896 if (!CONST_INT_P (x))
9897 {
9898 output_operand_lossage ("invalid operand for '%%%c'", code);
9899 return;
9900 }
9901
9902 HOST_WIDE_INT val = INTVAL (x);
9903 if ((val & ~7) == 8 || val == 0xff)
9904 fputc ('b', f);
9905 else if ((val & ~7) == 16 || val == 0xffff)
9906 fputc ('h', f);
9907 else if ((val & ~7) == 32 || val == 0xffffffff)
9908 fputc ('w', f);
9909 else
9910 {
9911 output_operand_lossage ("invalid operand for '%%%c'", code);
9912 return;
9913 }
9914 }
9915 break;
9916
9917 case 'p':
9918 {
9919 int n;
9920
9921 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9922 {
9923 output_operand_lossage ("invalid operand for '%%%c'", code);
9924 return;
9925 }
9926
9927 asm_fprintf (f, "%d", n);
9928 }
9929 break;
9930
9931 case 'P':
9932 if (!CONST_INT_P (x))
9933 {
9934 output_operand_lossage ("invalid operand for '%%%c'", code);
9935 return;
9936 }
9937
9938 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9939 break;
9940
9941 case 'H':
9942 if (x == const0_rtx)
9943 {
9944 asm_fprintf (f, "xzr");
9945 break;
9946 }
9947
9948 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9949 {
9950 output_operand_lossage ("invalid operand for '%%%c'", code);
9951 return;
9952 }
9953
9954 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9955 break;
9956
9957 case 'I':
9958 {
9959 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9960 if (CONST_INT_P (x))
9961 asm_fprintf (f, "%wd", INTVAL (x));
9962 else
9963 {
9964 output_operand_lossage ("invalid operand for '%%%c'", code);
9965 return;
9966 }
9967 break;
9968 }
9969
9970 case 'M':
9971 case 'm':
9972 {
9973 int cond_code;
9974 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9975 if (x == const_true_rtx)
9976 {
9977 if (code == 'M')
9978 fputs ("nv", f);
9979 return;
9980 }
9981
9982 if (!COMPARISON_P (x))
9983 {
9984 output_operand_lossage ("invalid operand for '%%%c'", code);
9985 return;
9986 }
9987
9988 cond_code = aarch64_get_condition_code (x);
9989 gcc_assert (cond_code >= 0);
9990 if (code == 'M')
9991 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9992 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9993 fputs (aarch64_sve_condition_codes[cond_code], f);
9994 else
9995 fputs (aarch64_condition_codes[cond_code], f);
9996 }
9997 break;
9998
9999 case 'N':
10000 if (!const_vec_duplicate_p (x, &elt))
10001 {
10002 output_operand_lossage ("invalid vector constant");
10003 return;
10004 }
10005
10006 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10007 asm_fprintf (f, "%wd", -INTVAL (elt));
10008 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10009 && aarch64_print_vector_float_operand (f, x, true))
10010 ;
10011 else
10012 {
10013 output_operand_lossage ("invalid vector constant");
10014 return;
10015 }
10016 break;
10017
10018 case 'b':
10019 case 'h':
10020 case 's':
10021 case 'd':
10022 case 'q':
10023 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10024 {
10025 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10026 return;
10027 }
10028 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10029 break;
10030
10031 case 'S':
10032 case 'T':
10033 case 'U':
10034 case 'V':
10035 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10036 {
10037 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10038 return;
10039 }
10040 asm_fprintf (f, "%c%d",
10041 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10042 REGNO (x) - V0_REGNUM + (code - 'S'));
10043 break;
10044
10045 case 'R':
10046 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10047 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10048 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10049 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10050 else
10051 output_operand_lossage ("incompatible register operand for '%%%c'",
10052 code);
10053 break;
10054
10055 case 'X':
10056 if (!CONST_INT_P (x))
10057 {
10058 output_operand_lossage ("invalid operand for '%%%c'", code);
10059 return;
10060 }
10061 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10062 break;
10063
10064 case 'C':
10065 {
10066 /* Print a replicated constant in hex. */
10067 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10068 {
10069 output_operand_lossage ("invalid operand for '%%%c'", code);
10070 return;
10071 }
10072 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10073 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10074 }
10075 break;
10076
10077 case 'D':
10078 {
10079 /* Print a replicated constant in decimal, treating it as
10080 unsigned. */
10081 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10082 {
10083 output_operand_lossage ("invalid operand for '%%%c'", code);
10084 return;
10085 }
10086 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10087 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10088 }
10089 break;
10090
10091 case 'w':
10092 case 'x':
10093 if (x == const0_rtx
10094 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10095 {
10096 asm_fprintf (f, "%czr", code);
10097 break;
10098 }
10099
10100 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10101 {
10102 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10103 break;
10104 }
10105
10106 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10107 {
10108 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10109 break;
10110 }
10111
10112 /* Fall through */
10113
10114 case 0:
10115 if (x == NULL)
10116 {
10117 output_operand_lossage ("missing operand");
10118 return;
10119 }
10120
10121 switch (GET_CODE (x))
10122 {
10123 case REG:
10124 if (aarch64_sve_data_mode_p (GET_MODE (x)))
10125 {
10126 if (REG_NREGS (x) == 1)
10127 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10128 else
10129 {
10130 char suffix
10131 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10132 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10133 REGNO (x) - V0_REGNUM, suffix,
10134 END_REGNO (x) - V0_REGNUM - 1, suffix);
10135 }
10136 }
10137 else
10138 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10139 break;
10140
10141 case MEM:
10142 output_address (GET_MODE (x), XEXP (x, 0));
10143 break;
10144
10145 case LABEL_REF:
10146 case SYMBOL_REF:
10147 output_addr_const (asm_out_file, x);
10148 break;
10149
10150 case CONST_INT:
10151 asm_fprintf (f, "%wd", INTVAL (x));
10152 break;
10153
10154 case CONST:
10155 if (!VECTOR_MODE_P (GET_MODE (x)))
10156 {
10157 output_addr_const (asm_out_file, x);
10158 break;
10159 }
10160 /* fall through */
10161
10162 case CONST_VECTOR:
10163 if (!const_vec_duplicate_p (x, &elt))
10164 {
10165 output_operand_lossage ("invalid vector constant");
10166 return;
10167 }
10168
10169 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10170 asm_fprintf (f, "%wd", INTVAL (elt));
10171 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10172 && aarch64_print_vector_float_operand (f, x, false))
10173 ;
10174 else
10175 {
10176 output_operand_lossage ("invalid vector constant");
10177 return;
10178 }
10179 break;
10180
10181 case CONST_DOUBLE:
10182 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10183 be getting CONST_DOUBLEs holding integers. */
10184 gcc_assert (GET_MODE (x) != VOIDmode);
10185 if (aarch64_float_const_zero_rtx_p (x))
10186 {
10187 fputc ('0', f);
10188 break;
10189 }
10190 else if (aarch64_float_const_representable_p (x))
10191 {
10192 #define buf_size 20
10193 char float_buf[buf_size] = {'\0'};
10194 real_to_decimal_for_mode (float_buf,
10195 CONST_DOUBLE_REAL_VALUE (x),
10196 buf_size, buf_size,
10197 1, GET_MODE (x));
10198 asm_fprintf (asm_out_file, "%s", float_buf);
10199 break;
10200 #undef buf_size
10201 }
10202 output_operand_lossage ("invalid constant");
10203 return;
10204 default:
10205 output_operand_lossage ("invalid operand");
10206 return;
10207 }
10208 break;
10209
10210 case 'A':
10211 if (GET_CODE (x) == HIGH)
10212 x = XEXP (x, 0);
10213
10214 switch (aarch64_classify_symbolic_expression (x))
10215 {
10216 case SYMBOL_SMALL_GOT_4G:
10217 asm_fprintf (asm_out_file, ":got:");
10218 break;
10219
10220 case SYMBOL_SMALL_TLSGD:
10221 asm_fprintf (asm_out_file, ":tlsgd:");
10222 break;
10223
10224 case SYMBOL_SMALL_TLSDESC:
10225 asm_fprintf (asm_out_file, ":tlsdesc:");
10226 break;
10227
10228 case SYMBOL_SMALL_TLSIE:
10229 asm_fprintf (asm_out_file, ":gottprel:");
10230 break;
10231
10232 case SYMBOL_TLSLE24:
10233 asm_fprintf (asm_out_file, ":tprel:");
10234 break;
10235
10236 case SYMBOL_TINY_GOT:
10237 gcc_unreachable ();
10238 break;
10239
10240 default:
10241 break;
10242 }
10243 output_addr_const (asm_out_file, x);
10244 break;
10245
10246 case 'L':
10247 switch (aarch64_classify_symbolic_expression (x))
10248 {
10249 case SYMBOL_SMALL_GOT_4G:
10250 asm_fprintf (asm_out_file, ":lo12:");
10251 break;
10252
10253 case SYMBOL_SMALL_TLSGD:
10254 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10255 break;
10256
10257 case SYMBOL_SMALL_TLSDESC:
10258 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10259 break;
10260
10261 case SYMBOL_SMALL_TLSIE:
10262 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10263 break;
10264
10265 case SYMBOL_TLSLE12:
10266 asm_fprintf (asm_out_file, ":tprel_lo12:");
10267 break;
10268
10269 case SYMBOL_TLSLE24:
10270 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10271 break;
10272
10273 case SYMBOL_TINY_GOT:
10274 asm_fprintf (asm_out_file, ":got:");
10275 break;
10276
10277 case SYMBOL_TINY_TLSIE:
10278 asm_fprintf (asm_out_file, ":gottprel:");
10279 break;
10280
10281 default:
10282 break;
10283 }
10284 output_addr_const (asm_out_file, x);
10285 break;
10286
10287 case 'G':
10288 switch (aarch64_classify_symbolic_expression (x))
10289 {
10290 case SYMBOL_TLSLE24:
10291 asm_fprintf (asm_out_file, ":tprel_hi12:");
10292 break;
10293 default:
10294 break;
10295 }
10296 output_addr_const (asm_out_file, x);
10297 break;
10298
10299 case 'k':
10300 {
10301 HOST_WIDE_INT cond_code;
10302
10303 if (!CONST_INT_P (x))
10304 {
10305 output_operand_lossage ("invalid operand for '%%%c'", code);
10306 return;
10307 }
10308
10309 cond_code = INTVAL (x);
10310 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10311 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10312 }
10313 break;
10314
10315 case 'y':
10316 case 'z':
10317 {
10318 machine_mode mode = GET_MODE (x);
10319
10320 if (GET_CODE (x) != MEM
10321 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10322 {
10323 output_operand_lossage ("invalid operand for '%%%c'", code);
10324 return;
10325 }
10326
10327 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10328 code == 'y'
10329 ? ADDR_QUERY_LDP_STP_N
10330 : ADDR_QUERY_LDP_STP))
10331 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10332 }
10333 break;
10334
10335 default:
10336 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10337 return;
10338 }
10339 }
10340
10341 /* Print address 'x' of a memory access with mode 'mode'.
10342 'type' gives the context required by aarch64_classify_address: ADDR_QUERY_M
10343 for a normal memory access, or an ADDR_QUERY_LDP_STP variant for LDP/STP. */
10344 static bool
10345 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10346 aarch64_addr_query_type type)
10347 {
10348 struct aarch64_address_info addr;
10349 unsigned int size, vec_flags;
10350
10351 /* Check all addresses are Pmode - including ILP32. */
10352 if (GET_MODE (x) != Pmode
10353 && (!CONST_INT_P (x)
10354 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10355 {
10356 output_operand_lossage ("invalid address mode");
10357 return false;
10358 }
10359
10360 if (aarch64_classify_address (&addr, x, mode, true, type))
10361 switch (addr.type)
10362 {
10363 case ADDRESS_REG_IMM:
10364 if (known_eq (addr.const_offset, 0))
10365 {
10366 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10367 return true;
10368 }
10369
10370 vec_flags = aarch64_classify_vector_mode (mode);
10371 if (vec_flags & VEC_ANY_SVE)
10372 {
10373 HOST_WIDE_INT vnum
10374 = exact_div (addr.const_offset,
10375 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10376 asm_fprintf (f, "[%s, #%wd, mul vl]",
10377 reg_names[REGNO (addr.base)], vnum);
10378 return true;
10379 }
10380
10381 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10382 INTVAL (addr.offset));
10383 return true;
10384
10385 case ADDRESS_REG_REG:
10386 if (addr.shift == 0)
10387 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10388 reg_names [REGNO (addr.offset)]);
10389 else
10390 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10391 reg_names [REGNO (addr.offset)], addr.shift);
10392 return true;
10393
10394 case ADDRESS_REG_UXTW:
10395 if (addr.shift == 0)
10396 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10397 REGNO (addr.offset) - R0_REGNUM);
10398 else
10399 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10400 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10401 return true;
10402
10403 case ADDRESS_REG_SXTW:
10404 if (addr.shift == 0)
10405 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10406 REGNO (addr.offset) - R0_REGNUM);
10407 else
10408 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10409 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10410 return true;
10411
10412 case ADDRESS_REG_WB:
10413 /* Writeback is only supported for fixed-width modes. */
10414 size = GET_MODE_SIZE (mode).to_constant ();
10415 switch (GET_CODE (x))
10416 {
10417 case PRE_INC:
10418 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10419 return true;
10420 case POST_INC:
10421 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10422 return true;
10423 case PRE_DEC:
10424 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10425 return true;
10426 case POST_DEC:
10427 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10428 return true;
10429 case PRE_MODIFY:
10430 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10431 INTVAL (addr.offset));
10432 return true;
10433 case POST_MODIFY:
10434 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10435 INTVAL (addr.offset));
10436 return true;
10437 default:
10438 break;
10439 }
10440 break;
10441
10442 case ADDRESS_LO_SUM:
10443 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10444 output_addr_const (f, addr.offset);
10445 asm_fprintf (f, "]");
10446 return true;
10447
10448 case ADDRESS_SYMBOLIC:
10449 output_addr_const (f, x);
10450 return true;
10451 }
10452
10453 return false;
10454 }
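/* For illustration, the address syntaxes printed above are, by address kind
   (register names and offsets are only examples):
     ADDRESS_REG_IMM      [x0]            [x0, 16]         [x0, #2, mul vl] (SVE)
     ADDRESS_REG_REG      [x0, x1]        [x0, x1, lsl 3]
     ADDRESS_REG_UXTW     [x0, w1, uxtw]  [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW     [x0, w1, sxtw]  [x0, w1, sxtw 2]
     ADDRESS_REG_WB       [x0, 16]!       [x0], 16         (and -16 variants)
     ADDRESS_LO_SUM       [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC     sym  */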
10455
10456 /* Print address 'x' of a memory access with mode 'mode'. */
10457 static void
10458 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10459 {
10460 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10461 output_addr_const (f, x);
10462 }
10463
10464 bool
10465 aarch64_label_mentioned_p (rtx x)
10466 {
10467 const char *fmt;
10468 int i;
10469
10470 if (GET_CODE (x) == LABEL_REF)
10471 return true;
10472
10473 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10474 referencing instruction, but they are constant offsets, not
10475 symbols. */
10476 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10477 return false;
10478
10479 fmt = GET_RTX_FORMAT (GET_CODE (x));
10480 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10481 {
10482 if (fmt[i] == 'E')
10483 {
10484 int j;
10485
10486 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10487 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10488 return 1;
10489 }
10490 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10491 return 1;
10492 }
10493
10494 return 0;
10495 }
10496
10497 /* Implement REGNO_REG_CLASS. */
10498
10499 enum reg_class
10500 aarch64_regno_regclass (unsigned regno)
10501 {
10502 if (GP_REGNUM_P (regno))
10503 return GENERAL_REGS;
10504
10505 if (regno == SP_REGNUM)
10506 return STACK_REG;
10507
10508 if (regno == FRAME_POINTER_REGNUM
10509 || regno == ARG_POINTER_REGNUM)
10510 return POINTER_REGS;
10511
10512 if (FP_REGNUM_P (regno))
10513 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10514 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10515
10516 if (PR_REGNUM_P (regno))
10517 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10518
10519 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10520 return FFR_REGS;
10521
10522 return NO_REGS;
10523 }
10524
10525 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10526 If OFFSET is out of range, return an offset of an anchor point
10527 that is in range. Return 0 otherwise. */
10528
10529 static HOST_WIDE_INT
10530 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10531 machine_mode mode)
10532 {
10533 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10534 if (size > 16)
10535 return (offset + 0x400) & ~0x7f0;
10536
10537 /* For offsets that aren't a multiple of the access size, the limit is
10538 -256...255. */
10539 if (offset & (size - 1))
10540 {
10541 /* BLKmode typically uses LDP of X-registers. */
10542 if (mode == BLKmode)
10543 return (offset + 512) & ~0x3ff;
10544 return (offset + 0x100) & ~0x1ff;
10545 }
10546
10547 /* Small negative offsets are supported. */
10548 if (IN_RANGE (offset, -256, 0))
10549 return 0;
10550
10551 if (mode == TImode || mode == TFmode)
10552 return (offset + 0x100) & ~0x1ff;
10553
10554 /* Otherwise, use the unsigned 12-bit offset scaled by the access size. */
10555 return offset & (~0xfff * size);
10556 }
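/* Worked examples of the anchoring rules above (illustrative values):
   aarch64_anchor_offset (0x12340, 8, DImode) returns 0x10000: the offset is
   a multiple of the access size but out of range, so it is rounded down to
   a multiple of 0x1000 * 8, leaving 0x2340, which fits the scaled unsigned
   12-bit LDR/STR offset.  aarch64_anchor_offset (-16, 8, DImode) returns 0,
   since small negative offsets are supported directly. */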
10557
10558 static rtx
10559 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10560 {
10561 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10562 where mask is selected by alignment and size of the offset.
10563 We try to pick as large a range for the offset as possible to
10564 maximize the chance of a CSE. However, for aligned addresses
10565 we limit the range to 4k so that structures with different sized
10566 elements are likely to use the same base. We need to be careful
10567 not to split a CONST for some forms of address expression, otherwise
10568 it will generate sub-optimal code. */
10569
10570 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10571 {
10572 rtx base = XEXP (x, 0);
10573 rtx offset_rtx = XEXP (x, 1);
10574 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10575
10576 if (GET_CODE (base) == PLUS)
10577 {
10578 rtx op0 = XEXP (base, 0);
10579 rtx op1 = XEXP (base, 1);
10580
10581 /* Force any scaling into a temp for CSE. */
10582 op0 = force_reg (Pmode, op0);
10583 op1 = force_reg (Pmode, op1);
10584
10585 /* Let the pointer register be in op0. */
10586 if (REG_POINTER (op1))
10587 std::swap (op0, op1);
10588
10589 /* If the pointer is virtual or frame related, then we know that
10590 virtual register instantiation or register elimination is going
10591 to apply a second constant. We want the two constants folded
10592 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10593 if (virt_or_elim_regno_p (REGNO (op0)))
10594 {
10595 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10596 NULL_RTX, true, OPTAB_DIRECT);
10597 return gen_rtx_PLUS (Pmode, base, op1);
10598 }
10599
10600 /* Otherwise, in order to encourage CSE (and thence loop strength
10601 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10602 base = expand_binop (Pmode, add_optab, op0, op1,
10603 NULL_RTX, true, OPTAB_DIRECT);
10604 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10605 }
10606
10607 HOST_WIDE_INT size;
10608 if (GET_MODE_SIZE (mode).is_constant (&size))
10609 {
10610 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10611 mode);
10612 if (base_offset != 0)
10613 {
10614 base = plus_constant (Pmode, base, base_offset);
10615 base = force_operand (base, NULL_RTX);
10616 return plus_constant (Pmode, base, offset - base_offset);
10617 }
10618 }
10619 }
10620
10621 return x;
10622 }
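/* For example (illustrative values), legitimizing X + 0x4004 for a DImode
   access: the offset is not 8-byte aligned, so aarch64_anchor_offset returns
   (0x4004 + 0x100) & ~0x1ff == 0x4000 and the address is rewritten as
   TMP + 4 with TMP = X + 0x4000, allowing the large part of the constant to
   be CSEd between neighbouring accesses. */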
10623
10624 static reg_class_t
10625 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10626 reg_class_t rclass,
10627 machine_mode mode,
10628 secondary_reload_info *sri)
10629 {
10630 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10631 LDR and STR. See the comment at the head of aarch64-sve.md for
10632 more details about the big-endian handling. */
10633 if (reg_class_subset_p (rclass, FP_REGS)
10634 && !((REG_P (x) && HARD_REGISTER_P (x))
10635 || aarch64_simd_valid_immediate (x, NULL))
10636 && mode != VNx16QImode)
10637 {
10638 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10639 if ((vec_flags & VEC_SVE_DATA)
10640 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10641 {
10642 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10643 return NO_REGS;
10644 }
10645 }
10646
10647 /* If we have to disable direct literal pool loads and stores because the
10648 function is too big, then we need a scratch register. */
10649 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10650 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10651 || targetm.vector_mode_supported_p (GET_MODE (x)))
10652 && !aarch64_pcrelative_literal_loads)
10653 {
10654 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10655 return NO_REGS;
10656 }
10657
10658 /* Without the TARGET_SIMD instructions we cannot move a Q register
10659 to a Q register directly. We need a scratch. */
10660 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10661 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10662 && reg_class_subset_p (rclass, FP_REGS))
10663 {
10664 sri->icode = code_for_aarch64_reload_mov (mode);
10665 return NO_REGS;
10666 }
10667
10668 /* A TFmode or TImode memory access should be handled via FP_REGS
10669 because AArch64 has richer addressing modes for LDR/STR instructions
10670 than LDP/STP instructions. */
10671 if (TARGET_FLOAT && rclass == GENERAL_REGS
10672 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10673 return FP_REGS;
10674
10675 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10676 return GENERAL_REGS;
10677
10678 return NO_REGS;
10679 }
10680
10681 static bool
10682 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10683 {
10684 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10685
10686 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10687 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10688 if (frame_pointer_needed)
10689 return to == HARD_FRAME_POINTER_REGNUM;
10690 return true;
10691 }
10692
10693 poly_int64
10694 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10695 {
10696 if (to == HARD_FRAME_POINTER_REGNUM)
10697 {
10698 if (from == ARG_POINTER_REGNUM)
10699 return cfun->machine->frame.hard_fp_offset;
10700
10701 if (from == FRAME_POINTER_REGNUM)
10702 return cfun->machine->frame.hard_fp_offset
10703 - cfun->machine->frame.locals_offset;
10704 }
10705
10706 if (to == STACK_POINTER_REGNUM)
10707 {
10708 if (from == FRAME_POINTER_REGNUM)
10709 return cfun->machine->frame.frame_size
10710 - cfun->machine->frame.locals_offset;
10711 }
10712
10713 return cfun->machine->frame.frame_size;
10714 }
10715
10716 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10717 previous frame. */
10718
10719 rtx
10720 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10721 {
10722 if (count != 0)
10723 return const0_rtx;
10724 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10725 }
10726
10727
10728 static void
10729 aarch64_asm_trampoline_template (FILE *f)
10730 {
10731 int offset1 = 16;
10732 int offset2 = 20;
10733
10734 if (aarch64_bti_enabled ())
10735 {
10736 asm_fprintf (f, "\thint\t34 // bti c\n");
10737 offset1 -= 4;
10738 offset2 -= 4;
10739 }
10740
10741 if (TARGET_ILP32)
10742 {
10743 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10744 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10745 offset1);
10746 }
10747 else
10748 {
10749 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10750 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10751 offset2);
10752 }
10753 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10754
10755 /* The trampoline needs an extra padding instruction. If BTI is
10756 enabled, the padding instruction is replaced by the BTI instruction
10757 at the beginning. */
10758 if (!aarch64_bti_enabled ())
10759 assemble_aligned_integer (4, const0_rtx);
10760
10761 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10762 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10763 }
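/* For the LP64 ABI without BTI, and assuming the port's usual register
   assignments (IP1_REGNUM is x17 and STATIC_CHAIN_REGNUM is x18), the
   template emitted above has the form:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding before the trailing data
	.xword	0		// overwritten with the function address
	.xword	0		// overwritten with the static chain

   aarch64_trampoline_init below fills in the two trailing pointers. */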
10764
10765 static void
10766 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10767 {
10768 rtx fnaddr, mem, a_tramp;
10769 const int tramp_code_sz = 16;
10770
10771 /* Don't need to copy the trailing D-words, we fill those in below. */
10772 emit_block_move (m_tramp, assemble_trampoline_template (),
10773 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10774 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10775 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10776 if (GET_MODE (fnaddr) != ptr_mode)
10777 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10778 emit_move_insn (mem, fnaddr);
10779
10780 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10781 emit_move_insn (mem, chain_value);
10782
10783 /* XXX We should really define a "clear_cache" pattern and use
10784 gen_clear_cache(). */
10785 a_tramp = XEXP (m_tramp, 0);
10786 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10787 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10788 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10789 ptr_mode);
10790 }
10791
10792 static unsigned char
10793 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10794 {
10795 /* ??? Logically we should only need to provide a value when
10796 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10797 can hold MODE, but at the moment we need to handle all modes.
10798 Just ignore any runtime parts for registers that can't store them. */
10799 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10800 unsigned int nregs, vec_flags;
10801 switch (regclass)
10802 {
10803 case TAILCALL_ADDR_REGS:
10804 case POINTER_REGS:
10805 case GENERAL_REGS:
10806 case ALL_REGS:
10807 case POINTER_AND_FP_REGS:
10808 case FP_REGS:
10809 case FP_LO_REGS:
10810 case FP_LO8_REGS:
10811 vec_flags = aarch64_classify_vector_mode (mode);
10812 if ((vec_flags & VEC_SVE_DATA)
10813 && constant_multiple_p (GET_MODE_SIZE (mode),
10814 aarch64_vl_bytes (mode, vec_flags), &nregs))
10815 return nregs;
10816 return (vec_flags & VEC_ADVSIMD
10817 ? CEIL (lowest_size, UNITS_PER_VREG)
10818 : CEIL (lowest_size, UNITS_PER_WORD));
10819 case STACK_REG:
10820 case PR_REGS:
10821 case PR_LO_REGS:
10822 case PR_HI_REGS:
10823 case FFR_REGS:
10824 case PR_AND_FFR_REGS:
10825 return 1;
10826
10827 case NO_REGS:
10828 return 0;
10829
10830 default:
10831 break;
10832 }
10833 gcc_unreachable ();
10834 }
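/* For example, aarch64_class_max_nregs (FP_REGS, V4SImode) is 1, since one
   Advanced SIMD register holds 16 bytes, while
   aarch64_class_max_nregs (GENERAL_REGS, TImode) is 2, since a 16-byte
   value needs a pair of X registers. */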
10835
10836 static reg_class_t
10837 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10838 {
10839 if (regclass == POINTER_REGS)
10840 return GENERAL_REGS;
10841
10842 if (regclass == STACK_REG)
10843 {
10844 if (REG_P(x)
10845 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10846 return regclass;
10847
10848 return NO_REGS;
10849 }
10850
10851 /* Register elimination can result in a request for
10852 SP+constant->FP_REGS. We cannot support such operations, which
10853 use SP as the source and an FP_REG as the destination, so reject
10854 them outright. */
10855 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10856 {
10857 rtx lhs = XEXP (x, 0);
10858
10859 /* Look through a possible SUBREG introduced by ILP32. */
10860 if (GET_CODE (lhs) == SUBREG)
10861 lhs = SUBREG_REG (lhs);
10862
10863 gcc_assert (REG_P (lhs));
10864 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10865 POINTER_REGS));
10866 return NO_REGS;
10867 }
10868
10869 return regclass;
10870 }
10871
10872 void
10873 aarch64_asm_output_labelref (FILE* f, const char *name)
10874 {
10875 asm_fprintf (f, "%U%s", name);
10876 }
10877
10878 static void
10879 aarch64_elf_asm_constructor (rtx symbol, int priority)
10880 {
10881 if (priority == DEFAULT_INIT_PRIORITY)
10882 default_ctor_section_asm_out_constructor (symbol, priority);
10883 else
10884 {
10885 section *s;
10886 /* While the priority is known to be in the range [0, 65535], so that
10887 18 bytes would be enough, the compiler might not know that. To avoid
10888 a -Wformat-truncation false positive, use a larger size. */
10889 char buf[23];
10890 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10891 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10892 switch_to_section (s);
10893 assemble_align (POINTER_SIZE);
10894 assemble_aligned_integer (POINTER_BYTES, symbol);
10895 }
10896 }
10897
10898 static void
10899 aarch64_elf_asm_destructor (rtx symbol, int priority)
10900 {
10901 if (priority == DEFAULT_INIT_PRIORITY)
10902 default_dtor_section_asm_out_destructor (symbol, priority);
10903 else
10904 {
10905 section *s;
10906 /* While the priority is known to be in the range [0, 65535], so that
10907 18 bytes would be enough, the compiler might not know that. To avoid
10908 a -Wformat-truncation false positive, use a larger size. */
10909 char buf[23];
10910 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10911 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10912 switch_to_section (s);
10913 assemble_align (POINTER_SIZE);
10914 assemble_aligned_integer (POINTER_BYTES, symbol);
10915 }
10916 }
10917
10918 const char*
10919 aarch64_output_casesi (rtx *operands)
10920 {
10921 char buf[100];
10922 char label[100];
10923 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10924 int index;
10925 static const char *const patterns[4][2] =
10926 {
10927 {
10928 "ldrb\t%w3, [%0,%w1,uxtw]",
10929 "add\t%3, %4, %w3, sxtb #2"
10930 },
10931 {
10932 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10933 "add\t%3, %4, %w3, sxth #2"
10934 },
10935 {
10936 "ldr\t%w3, [%0,%w1,uxtw #2]",
10937 "add\t%3, %4, %w3, sxtw #2"
10938 },
10939 /* We assume that DImode is only generated when not optimizing and
10940 that we don't really need 64-bit address offsets. That would
10941 imply an object file with 8GB of code in a single function! */
10942 {
10943 "ldr\t%w3, [%0,%w1,uxtw #2]",
10944 "add\t%3, %4, %w3, sxtw #2"
10945 }
10946 };
10947
10948 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10949
10950 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10951 index = exact_log2 (GET_MODE_SIZE (mode));
10952
10953 gcc_assert (index >= 0 && index <= 3);
10954
10955 /* Table size reduction still needs to be implemented, by changing the code below. */
10956 output_asm_insn (patterns[index][0], operands);
10957 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10958 snprintf (buf, sizeof (buf),
10959 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10960 output_asm_insn (buf, operands);
10961 output_asm_insn (patterns[index][1], operands);
10962 output_asm_insn ("br\t%3", operands);
10963 assemble_label (asm_out_file, label);
10964 return "";
10965 }
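/* For example, for a HImode dispatch table the code above emits a sequence
   of the form (register numbers and label name illustrative):

	ldrh	w3, [x0, w1, uxtw #1]	// load the table entry
	adr	x4, .Lrtx7		// base label for the table offsets
	add	x3, x4, w3, sxth #2	// scale the entry and add the base
	br	x3
   .Lrtx7:  */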
10966
10967
10968 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10969 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10970 operator. */
10971
10972 int
10973 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10974 {
10975 if (shift >= 0 && shift <= 3)
10976 {
10977 int size;
10978 for (size = 8; size <= 32; size *= 2)
10979 {
10980 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10981 if (mask == bits << shift)
10982 return size;
10983 }
10984 }
10985 return 0;
10986 }
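/* Worked examples: aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2 and the operand can therefore be written as a UXTB
   with LSL #2; aarch64_uxt_size (0, 0xffff) returns 16 (UXTH); and
   aarch64_uxt_size (1, 0xff) returns 0 because 0xff is not 0xff << 1. */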
10987
10988 /* Constant pools are per-function only when PC-relative literal loads
10989 are enabled or we are using the large memory
10990 model. */
10991
10992 static inline bool
10993 aarch64_can_use_per_function_literal_pools_p (void)
10994 {
10995 return (aarch64_pcrelative_literal_loads
10996 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10997 }
10998
10999 static bool
11000 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11001 {
11002 /* We can't use blocks for constants when we're using a per-function
11003 constant pool. */
11004 return !aarch64_can_use_per_function_literal_pools_p ();
11005 }
11006
11007 /* Select appropriate section for constants depending
11008 on where we place literal pools. */
11009
11010 static section *
11011 aarch64_select_rtx_section (machine_mode mode,
11012 rtx x,
11013 unsigned HOST_WIDE_INT align)
11014 {
11015 if (aarch64_can_use_per_function_literal_pools_p ())
11016 return function_section (current_function_decl);
11017
11018 return default_elf_select_rtx_section (mode, x, align);
11019 }
11020
11021 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11022 void
11023 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11024 HOST_WIDE_INT offset)
11025 {
11026 /* When using per-function literal pools, we must ensure that any code
11027 section is aligned to the minimal instruction length, lest we get
11028 errors from the assembler re "unaligned instructions". */
11029 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11030 ASM_OUTPUT_ALIGN (f, 2);
11031 }
11032
11033 /* Costs. */
11034
11035 /* Helper function for rtx cost calculation. Strip a shift expression
11036 from X. Returns the inner operand if successful, or the original
11037 expression on failure. */
11038 static rtx
11039 aarch64_strip_shift (rtx x)
11040 {
11041 rtx op = x;
11042
11043 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11044 we can convert both to ROR during final output. */
11045 if ((GET_CODE (op) == ASHIFT
11046 || GET_CODE (op) == ASHIFTRT
11047 || GET_CODE (op) == LSHIFTRT
11048 || GET_CODE (op) == ROTATERT
11049 || GET_CODE (op) == ROTATE)
11050 && CONST_INT_P (XEXP (op, 1)))
11051 return XEXP (op, 0);
11052
11053 if (GET_CODE (op) == MULT
11054 && CONST_INT_P (XEXP (op, 1))
11055 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11056 return XEXP (op, 0);
11057
11058 return x;
11059 }
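/* For example, aarch64_strip_shift applied to (ashift (reg X) (const_int 3))
   or to (mult (reg X) (const_int 8)) returns the inner register X; anything
   that is not a constant shift, rotate or power-of-two multiply is returned
   unchanged. */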
11060
11061 /* Helper function for rtx cost calculation. Strip an extend
11062 expression from X. Returns the inner operand if successful, or the
11063 original expression on failure. We deal with a number of possible
11064 canonicalization variations here. If STRIP_SHIFT is true, then
11065 we can strip off a shift also. */
11066 static rtx
11067 aarch64_strip_extend (rtx x, bool strip_shift)
11068 {
11069 scalar_int_mode mode;
11070 rtx op = x;
11071
11072 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11073 return op;
11074
11075 /* Zero and sign extraction of a widened value. */
11076 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11077 && XEXP (op, 2) == const0_rtx
11078 && GET_CODE (XEXP (op, 0)) == MULT
11079 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
11080 XEXP (op, 1)))
11081 return XEXP (XEXP (op, 0), 0);
11082
11083 /* It can also be represented (for zero-extend) as an AND with an
11084 immediate. */
11085 if (GET_CODE (op) == AND
11086 && GET_CODE (XEXP (op, 0)) == MULT
11087 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11088 && CONST_INT_P (XEXP (op, 1))
11089 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11090 INTVAL (XEXP (op, 1))) != 0)
11091 return XEXP (XEXP (op, 0), 0);
11092
11093 /* Now handle extended register, as this may also have an optional
11094 left shift by 1..4. */
11095 if (strip_shift
11096 && GET_CODE (op) == ASHIFT
11097 && CONST_INT_P (XEXP (op, 1))
11098 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11099 op = XEXP (op, 0);
11100
11101 if (GET_CODE (op) == ZERO_EXTEND
11102 || GET_CODE (op) == SIGN_EXTEND)
11103 op = XEXP (op, 0);
11104
11105 if (op != x)
11106 return op;
11107
11108 return x;
11109 }
11110
11111 /* Return true iff CODE is a shift supported in combination
11112 with arithmetic instructions. */
11113
11114 static bool
11115 aarch64_shift_p (enum rtx_code code)
11116 {
11117 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11118 }
11119
11120
11121 /* Return true iff X is a cheap shift without a sign extend. */
11122
11123 static bool
11124 aarch64_cheap_mult_shift_p (rtx x)
11125 {
11126 rtx op0, op1;
11127
11128 op0 = XEXP (x, 0);
11129 op1 = XEXP (x, 1);
11130
11131 if (!(aarch64_tune_params.extra_tuning_flags
11132 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11133 return false;
11134
11135 if (GET_CODE (op0) == SIGN_EXTEND)
11136 return false;
11137
11138 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11139 && UINTVAL (op1) <= 4)
11140 return true;
11141
11142 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11143 return false;
11144
11145 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11146
11147 if (l2 > 0 && l2 <= 4)
11148 return true;
11149
11150 return false;
11151 }
11152
11153 /* Helper function for rtx cost calculation. Calculate the cost of
11154 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11155 Return the calculated cost of the expression, recursing manually in to
11156 operands where needed. */
11157
11158 static int
11159 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11160 {
11161 rtx op0, op1;
11162 const struct cpu_cost_table *extra_cost
11163 = aarch64_tune_params.insn_extra_cost;
11164 int cost = 0;
11165 bool compound_p = (outer == PLUS || outer == MINUS);
11166 machine_mode mode = GET_MODE (x);
11167
11168 gcc_checking_assert (code == MULT);
11169
11170 op0 = XEXP (x, 0);
11171 op1 = XEXP (x, 1);
11172
11173 if (VECTOR_MODE_P (mode))
11174 mode = GET_MODE_INNER (mode);
11175
11176 /* Integer multiply/fma. */
11177 if (GET_MODE_CLASS (mode) == MODE_INT)
11178 {
11179 /* The multiply will be canonicalized as a shift, cost it as such. */
11180 if (aarch64_shift_p (GET_CODE (x))
11181 || (CONST_INT_P (op1)
11182 && exact_log2 (INTVAL (op1)) > 0))
11183 {
11184 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11185 || GET_CODE (op0) == SIGN_EXTEND;
11186 if (speed)
11187 {
11188 if (compound_p)
11189 {
11190 /* If the shift is considered cheap,
11191 then don't add any cost. */
11192 if (aarch64_cheap_mult_shift_p (x))
11193 ;
11194 else if (REG_P (op1))
11195 /* ARITH + shift-by-register. */
11196 cost += extra_cost->alu.arith_shift_reg;
11197 else if (is_extend)
11198 /* ARITH + extended register. We don't have a cost field
11199 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11200 cost += extra_cost->alu.extend_arith;
11201 else
11202 /* ARITH + shift-by-immediate. */
11203 cost += extra_cost->alu.arith_shift;
11204 }
11205 else
11206 /* LSL (immediate). */
11207 cost += extra_cost->alu.shift;
11208
11209 }
11210 /* Strip extends as we will have costed them in the case above. */
11211 if (is_extend)
11212 op0 = aarch64_strip_extend (op0, true);
11213
11214 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11215
11216 return cost;
11217 }
11218
11219 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
11220 compound, and let the cases below handle it. After all, MNEG is a
11221 special-case alias of MSUB. */
11222 if (GET_CODE (op0) == NEG)
11223 {
11224 op0 = XEXP (op0, 0);
11225 compound_p = true;
11226 }
11227
11228 /* Integer multiplies or FMAs have zero/sign extending variants. */
11229 if ((GET_CODE (op0) == ZERO_EXTEND
11230 && GET_CODE (op1) == ZERO_EXTEND)
11231 || (GET_CODE (op0) == SIGN_EXTEND
11232 && GET_CODE (op1) == SIGN_EXTEND))
11233 {
11234 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11235 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11236
11237 if (speed)
11238 {
11239 if (compound_p)
11240 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
11241 cost += extra_cost->mult[0].extend_add;
11242 else
11243 /* MUL/SMULL/UMULL. */
11244 cost += extra_cost->mult[0].extend;
11245 }
11246
11247 return cost;
11248 }
11249
11250 /* This is either an integer multiply or a MADD. In both cases
11251 we want to recurse and cost the operands. */
11252 cost += rtx_cost (op0, mode, MULT, 0, speed);
11253 cost += rtx_cost (op1, mode, MULT, 1, speed);
11254
11255 if (speed)
11256 {
11257 if (compound_p)
11258 /* MADD/MSUB. */
11259 cost += extra_cost->mult[mode == DImode].add;
11260 else
11261 /* MUL. */
11262 cost += extra_cost->mult[mode == DImode].simple;
11263 }
11264
11265 return cost;
11266 }
11267 else
11268 {
11269 if (speed)
11270 {
11271 /* Floating-point FMA/FMUL can also support negations of the
11272 operands, unless the rounding mode is upward or downward, in
11273 which case FNMUL is different from FMUL with operand negation. */
11274 bool neg0 = GET_CODE (op0) == NEG;
11275 bool neg1 = GET_CODE (op1) == NEG;
11276 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11277 {
11278 if (neg0)
11279 op0 = XEXP (op0, 0);
11280 if (neg1)
11281 op1 = XEXP (op1, 0);
11282 }
11283
11284 if (compound_p)
11285 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11286 cost += extra_cost->fp[mode == DFmode].fma;
11287 else
11288 /* FMUL/FNMUL. */
11289 cost += extra_cost->fp[mode == DFmode].mult;
11290 }
11291
11292 cost += rtx_cost (op0, mode, MULT, 0, speed);
11293 cost += rtx_cost (op1, mode, MULT, 1, speed);
11294 return cost;
11295 }
11296 }
11297
11298 static int
11299 aarch64_address_cost (rtx x,
11300 machine_mode mode,
11301 addr_space_t as ATTRIBUTE_UNUSED,
11302 bool speed)
11303 {
11304 enum rtx_code c = GET_CODE (x);
11305 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11306 struct aarch64_address_info info;
11307 int cost = 0;
11308 info.shift = 0;
11309
11310 if (!aarch64_classify_address (&info, x, mode, false))
11311 {
11312 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11313 {
11314 /* This is a CONST or SYMBOL ref which will be split
11315 in a different way depending on the code model in use.
11316 Cost it through the generic infrastructure. */
11317 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11318 /* Divide through by the cost of one instruction to
11319 bring it to the same units as the address costs. */
11320 cost_symbol_ref /= COSTS_N_INSNS (1);
11321 /* The cost is then the cost of preparing the address,
11322 followed by an immediate (possibly 0) offset. */
11323 return cost_symbol_ref + addr_cost->imm_offset;
11324 }
11325 else
11326 {
11327 /* This is most likely a jump table from a case
11328 statement. */
11329 return addr_cost->register_offset;
11330 }
11331 }
11332
11333 switch (info.type)
11334 {
11335 case ADDRESS_LO_SUM:
11336 case ADDRESS_SYMBOLIC:
11337 case ADDRESS_REG_IMM:
11338 cost += addr_cost->imm_offset;
11339 break;
11340
11341 case ADDRESS_REG_WB:
11342 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11343 cost += addr_cost->pre_modify;
11344 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11345 cost += addr_cost->post_modify;
11346 else
11347 gcc_unreachable ();
11348
11349 break;
11350
11351 case ADDRESS_REG_REG:
11352 cost += addr_cost->register_offset;
11353 break;
11354
11355 case ADDRESS_REG_SXTW:
11356 cost += addr_cost->register_sextend;
11357 break;
11358
11359 case ADDRESS_REG_UXTW:
11360 cost += addr_cost->register_zextend;
11361 break;
11362
11363 default:
11364 gcc_unreachable ();
11365 }
11366
11367
11368 if (info.shift > 0)
11369 {
11370 /* For the sake of calculating the cost of the shifted register
11371 component, we can treat same sized modes in the same way. */
11372 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11373 cost += addr_cost->addr_scale_costs.hi;
11374 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11375 cost += addr_cost->addr_scale_costs.si;
11376 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11377 cost += addr_cost->addr_scale_costs.di;
11378 else
11379 /* We can't tell, or this is a 128-bit vector. */
11380 cost += addr_cost->addr_scale_costs.ti;
11381 }
11382
11383 return cost;
11384 }
11385
11386 /* Return the cost of a branch. If SPEED_P is true then the compiler is
11387 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11388 to be taken. */
11389
11390 int
11391 aarch64_branch_cost (bool speed_p, bool predictable_p)
11392 {
11393 /* When optimizing for speed, use the cost of unpredictable branches. */
11394 const struct cpu_branch_cost *branch_costs =
11395 aarch64_tune_params.branch_costs;
11396
11397 if (!speed_p || predictable_p)
11398 return branch_costs->predictable;
11399 else
11400 return branch_costs->unpredictable;
11401 }
11402
11403 /* Return true if the RTX X in mode MODE is a zero or sign extract
11404 usable in an ADD or SUB (extended register) instruction. */
11405 static bool
11406 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
11407 {
11408 /* Catch add with a sign extract.
11409 This is add_<optab><mode>_multp2. */
11410 if (GET_CODE (x) == SIGN_EXTRACT
11411 || GET_CODE (x) == ZERO_EXTRACT)
11412 {
11413 rtx op0 = XEXP (x, 0);
11414 rtx op1 = XEXP (x, 1);
11415 rtx op2 = XEXP (x, 2);
11416
11417 if (GET_CODE (op0) == MULT
11418 && CONST_INT_P (op1)
11419 && op2 == const0_rtx
11420 && CONST_INT_P (XEXP (op0, 1))
11421 && aarch64_is_extend_from_extract (mode,
11422 XEXP (op0, 1),
11423 op1))
11424 {
11425 return true;
11426 }
11427 }
11428 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11429 No shift. */
11430 else if (GET_CODE (x) == SIGN_EXTEND
11431 || GET_CODE (x) == ZERO_EXTEND)
11432 return REG_P (XEXP (x, 0));
11433
11434 return false;
11435 }
11436
11437 static bool
11438 aarch64_frint_unspec_p (unsigned int u)
11439 {
11440 switch (u)
11441 {
11442 case UNSPEC_FRINTZ:
11443 case UNSPEC_FRINTP:
11444 case UNSPEC_FRINTM:
11445 case UNSPEC_FRINTA:
11446 case UNSPEC_FRINTN:
11447 case UNSPEC_FRINTX:
11448 case UNSPEC_FRINTI:
11449 return true;
11450
11451 default:
11452 return false;
11453 }
11454 }
11455
11456 /* Return true iff X is an rtx that will match an extr instruction
11457 i.e. as described in the *extr<mode>5_insn family of patterns.
11458 OP0 and OP1 will be set to the operands of the shifts involved
11459 on success and will be NULL_RTX otherwise. */
11460
11461 static bool
11462 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11463 {
11464 rtx op0, op1;
11465 scalar_int_mode mode;
11466 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11467 return false;
11468
11469 *res_op0 = NULL_RTX;
11470 *res_op1 = NULL_RTX;
11471
11472 if (GET_CODE (x) != IOR)
11473 return false;
11474
11475 op0 = XEXP (x, 0);
11476 op1 = XEXP (x, 1);
11477
11478 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11479 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11480 {
11481 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11482 if (GET_CODE (op1) == ASHIFT)
11483 std::swap (op0, op1);
11484
11485 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11486 return false;
11487
11488 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11489 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11490
11491 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11492 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11493 {
11494 *res_op0 = XEXP (op0, 0);
11495 *res_op1 = XEXP (op1, 0);
11496 return true;
11497 }
11498 }
11499
11500 return false;
11501 }
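/* For example, in DImode the expression
   (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16))) matches: the
   two shift amounts sum to 64, so *RES_OP0 is set to X, *RES_OP1 to Y, and
   the whole expression corresponds to a single "extr Xd, Xn, Xm, #16". */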
11502
11503 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11504 storing it in *COST. Result is true if the total cost of the operation
11505 has now been calculated. */
11506 static bool
11507 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11508 {
11509 rtx inner;
11510 rtx comparator;
11511 enum rtx_code cmpcode;
11512 const struct cpu_cost_table *extra_cost
11513 = aarch64_tune_params.insn_extra_cost;
11514
11515 if (COMPARISON_P (op0))
11516 {
11517 inner = XEXP (op0, 0);
11518 comparator = XEXP (op0, 1);
11519 cmpcode = GET_CODE (op0);
11520 }
11521 else
11522 {
11523 inner = op0;
11524 comparator = const0_rtx;
11525 cmpcode = NE;
11526 }
11527
11528 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11529 {
11530 /* Conditional branch. */
11531 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11532 return true;
11533 else
11534 {
11535 if (cmpcode == NE || cmpcode == EQ)
11536 {
11537 if (comparator == const0_rtx)
11538 {
11539 /* TBZ/TBNZ/CBZ/CBNZ. */
11540 if (GET_CODE (inner) == ZERO_EXTRACT)
11541 /* TBZ/TBNZ. */
11542 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11543 ZERO_EXTRACT, 0, speed);
11544 else
11545 /* CBZ/CBNZ. */
11546 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11547
11548 return true;
11549 }
11550 if (register_operand (inner, VOIDmode)
11551 && aarch64_imm24 (comparator, VOIDmode))
11552 {
11553 /* SUB and SUBS. */
11554 *cost += COSTS_N_INSNS (2);
11555 if (speed)
11556 *cost += extra_cost->alu.arith * 2;
11557 return true;
11558 }
11559 }
11560 else if (cmpcode == LT || cmpcode == GE)
11561 {
11562 /* TBZ/TBNZ. */
11563 if (comparator == const0_rtx)
11564 return true;
11565 }
11566 }
11567 }
11568 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11569 {
11570 /* CCMP. */
11571 if (GET_CODE (op1) == COMPARE)
11572 {
11573 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11574 if (XEXP (op1, 1) == const0_rtx)
11575 *cost += 1;
11576 if (speed)
11577 {
11578 machine_mode mode = GET_MODE (XEXP (op1, 0));
11579 const struct cpu_cost_table *extra_cost
11580 = aarch64_tune_params.insn_extra_cost;
11581
11582 if (GET_MODE_CLASS (mode) == MODE_INT)
11583 *cost += extra_cost->alu.arith;
11584 else
11585 *cost += extra_cost->fp[mode == DFmode].compare;
11586 }
11587 return true;
11588 }
11589
11590 /* It's a conditional operation based on the status flags,
11591 so it must be some flavor of CSEL. */
11592
11593 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11594 if (GET_CODE (op1) == NEG
11595 || GET_CODE (op1) == NOT
11596 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11597 op1 = XEXP (op1, 0);
11598 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11599 {
11600 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11601 op1 = XEXP (op1, 0);
11602 op2 = XEXP (op2, 0);
11603 }
11604
11605 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11606 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11607 return true;
11608 }
11609
11610 /* We don't know what this is, cost all operands. */
11611 return false;
11612 }
11613
11614 /* Check whether X is a bitfield operation of the form shift + extend that
11615 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11616 operand to which the bitfield operation is applied. Otherwise return
11617 NULL_RTX. */
11618
11619 static rtx
11620 aarch64_extend_bitfield_pattern_p (rtx x)
11621 {
11622 rtx_code outer_code = GET_CODE (x);
11623 machine_mode outer_mode = GET_MODE (x);
11624
11625 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11626 && outer_mode != SImode && outer_mode != DImode)
11627 return NULL_RTX;
11628
11629 rtx inner = XEXP (x, 0);
11630 rtx_code inner_code = GET_CODE (inner);
11631 machine_mode inner_mode = GET_MODE (inner);
11632 rtx op = NULL_RTX;
11633
11634 switch (inner_code)
11635 {
11636 case ASHIFT:
11637 if (CONST_INT_P (XEXP (inner, 1))
11638 && (inner_mode == QImode || inner_mode == HImode))
11639 op = XEXP (inner, 0);
11640 break;
11641 case LSHIFTRT:
11642 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11643 && (inner_mode == QImode || inner_mode == HImode))
11644 op = XEXP (inner, 0);
11645 break;
11646 case ASHIFTRT:
11647 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11648 && (inner_mode == QImode || inner_mode == HImode))
11649 op = XEXP (inner, 0);
11650 break;
11651 default:
11652 break;
11653 }
11654
11655 return op;
11656 }
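/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int 3))) is
   such a pattern: the shift plus zero-extension maps onto a single UBFX,
   and the inner register R is returned. */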
11657
11658 /* Return true if the mask and a shift amount from an RTX of the form
11659 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11660 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11661
11662 bool
11663 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11664 rtx shft_amnt)
11665 {
11666 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11667 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11668 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11669 && (INTVAL (mask)
11670 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11671 }
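/* For example, for SImode, MASK == 0x7f8 and SHFT_AMNT == 3 qualify:
   (0x7f8 >> 3) + 1 == 0x100 is a power of two and no mask bit lies below
   the shift, so (X << 3) & 0x7f8 can be emitted as a UBFIZ with lsb 3 and
   width 8. */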
11672
11673 /* Return true if the masks and a shift amount from an RTX of the form
11674 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11675 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
11676
11677 bool
11678 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11679 unsigned HOST_WIDE_INT mask1,
11680 unsigned HOST_WIDE_INT shft_amnt,
11681 unsigned HOST_WIDE_INT mask2)
11682 {
11683 unsigned HOST_WIDE_INT t;
11684
11685 /* Verify that there is no overlap in what bits are set in the two masks. */
11686 if (mask1 != ~mask2)
11687 return false;
11688
11689 /* Verify that mask2 is not all zeros or ones. */
11690 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11691 return false;
11692
11693 /* The shift amount should always be less than the mode size. */
11694 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11695
11696 /* Verify that the mask being shifted is contiguous and would be in the
11697 least significant bits after shifting by shft_amnt. */
11698 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11699 return (t == (t & -t));
11700 }
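/* For example, for DImode, MASK1 == ~(unsigned HOST_WIDE_INT) 0xff00,
   SHFT_AMNT == 8 and MASK2 == 0xff00 satisfy all of the checks above:
   MASK2 + (1 << 8) == 0x10000 is a power of two, so the insertion can be
   done with a single BFI of width 8 at bit position 8. */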
11701
11702 /* Calculate the cost of calculating X, storing it in *COST. Result
11703 is true if the total cost of the operation has now been calculated. */
11704 static bool
11705 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11706 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11707 {
11708 rtx op0, op1, op2;
11709 const struct cpu_cost_table *extra_cost
11710 = aarch64_tune_params.insn_extra_cost;
11711 int code = GET_CODE (x);
11712 scalar_int_mode int_mode;
11713
11714 /* By default, assume that everything has equivalent cost to the
11715 cheapest instruction. Any additional costs are applied as a delta
11716 above this default. */
11717 *cost = COSTS_N_INSNS (1);
11718
11719 switch (code)
11720 {
11721 case SET:
11722 /* The cost depends entirely on the operands to SET. */
11723 *cost = 0;
11724 op0 = SET_DEST (x);
11725 op1 = SET_SRC (x);
11726
11727 switch (GET_CODE (op0))
11728 {
11729 case MEM:
11730 if (speed)
11731 {
11732 rtx address = XEXP (op0, 0);
11733 if (VECTOR_MODE_P (mode))
11734 *cost += extra_cost->ldst.storev;
11735 else if (GET_MODE_CLASS (mode) == MODE_INT)
11736 *cost += extra_cost->ldst.store;
11737 else if (mode == SFmode)
11738 *cost += extra_cost->ldst.storef;
11739 else if (mode == DFmode)
11740 *cost += extra_cost->ldst.stored;
11741
11742 *cost +=
11743 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11744 0, speed));
11745 }
11746
11747 *cost += rtx_cost (op1, mode, SET, 1, speed);
11748 return true;
11749
11750 case SUBREG:
11751 if (! REG_P (SUBREG_REG (op0)))
11752 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11753
11754 /* Fall through. */
11755 case REG:
11756 /* The cost is one per vector-register copied. */
11757 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11758 {
11759 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11760 *cost = COSTS_N_INSNS (nregs);
11761 }
11762 /* const0_rtx is in general free, but we will use an
11763 instruction to set a register to 0. */
11764 else if (REG_P (op1) || op1 == const0_rtx)
11765 {
11766 /* The cost is 1 per register copied. */
11767 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11768 *cost = COSTS_N_INSNS (nregs);
11769 }
11770 else
11771 /* Cost is just the cost of the RHS of the set. */
11772 *cost += rtx_cost (op1, mode, SET, 1, speed);
11773 return true;
11774
11775 case ZERO_EXTRACT:
11776 case SIGN_EXTRACT:
11777 /* Bit-field insertion. Strip any redundant widening of
11778 the RHS to meet the width of the target. */
11779 if (GET_CODE (op1) == SUBREG)
11780 op1 = SUBREG_REG (op1);
11781 if ((GET_CODE (op1) == ZERO_EXTEND
11782 || GET_CODE (op1) == SIGN_EXTEND)
11783 && CONST_INT_P (XEXP (op0, 1))
11784 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11785 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11786 op1 = XEXP (op1, 0);
11787
11788 if (CONST_INT_P (op1))
11789 {
11790 /* MOV immediate is assumed to always be cheap. */
11791 *cost = COSTS_N_INSNS (1);
11792 }
11793 else
11794 {
11795 /* BFM. */
11796 if (speed)
11797 *cost += extra_cost->alu.bfi;
11798 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11799 }
11800
11801 return true;
11802
11803 default:
11804 /* We can't make sense of this, assume default cost. */
11805 *cost = COSTS_N_INSNS (1);
11806 return false;
11807 }
11808 return false;
11809
11810 case CONST_INT:
11811 /* If an instruction can incorporate a constant within the
11812 instruction, the instruction's expression avoids calling
11813 rtx_cost() on the constant. If rtx_cost() is called on a
11814 constant, then it is usually because the constant must be
11815 moved into a register by one or more instructions.
11816
11817 The exception is constant 0, which can be expressed
11818 as XZR/WZR and is therefore free. The one case where this does not
11819 hold is (set (reg) (const0_rtx)), where we must still cost
11820 the move. However, we can catch that when we cost the SET, so
11821 we don't need to consider that here. */
11822 if (x == const0_rtx)
11823 *cost = 0;
11824 else
11825 {
11826 /* To an approximation, building any other constant is
11827 proportionally expensive to the number of instructions
11828 required to build that constant. This is true whether we
11829 are compiling for SPEED or otherwise. */
11830 if (!is_a <scalar_int_mode> (mode, &int_mode))
11831 int_mode = word_mode;
11832 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11833 (NULL_RTX, x, false, int_mode));
11834 }
11835 return true;
11836
11837 case CONST_DOUBLE:
11838
11839 /* First determine number of instructions to do the move
11840 as an integer constant. */
11841 if (!aarch64_float_const_representable_p (x)
11842 && !aarch64_can_const_movi_rtx_p (x, mode)
11843 && aarch64_float_const_rtx_p (x))
11844 {
11845 unsigned HOST_WIDE_INT ival;
11846 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11847 gcc_assert (succeed);
11848
11849 scalar_int_mode imode = (mode == HFmode
11850 ? SImode
11851 : int_mode_for_mode (mode).require ());
11852 int ncost = aarch64_internal_mov_immediate
11853 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11854 *cost += COSTS_N_INSNS (ncost);
11855 return true;
11856 }
11857
11858 if (speed)
11859 {
11860 /* mov[df,sf]_aarch64. */
11861 if (aarch64_float_const_representable_p (x))
11862 /* FMOV (scalar immediate). */
11863 *cost += extra_cost->fp[mode == DFmode].fpconst;
11864 else if (!aarch64_float_const_zero_rtx_p (x))
11865 {
11866 /* This will be a load from memory. */
11867 if (mode == DFmode)
11868 *cost += extra_cost->ldst.loadd;
11869 else
11870 *cost += extra_cost->ldst.loadf;
11871 }
11872 else
11873 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11874 or MOV v0.s[0], wzr - neither of which is modeled by the
11875 cost tables. Just use the default cost. */
11876 {
11877 }
11878 }
11879
11880 return true;
11881
11882 case MEM:
11883 if (speed)
11884 {
11885 /* For loads we want the base cost of a load, plus an
11886 approximation for the additional cost of the addressing
11887 mode. */
11888 rtx address = XEXP (x, 0);
11889 if (VECTOR_MODE_P (mode))
11890 *cost += extra_cost->ldst.loadv;
11891 else if (GET_MODE_CLASS (mode) == MODE_INT)
11892 *cost += extra_cost->ldst.load;
11893 else if (mode == SFmode)
11894 *cost += extra_cost->ldst.loadf;
11895 else if (mode == DFmode)
11896 *cost += extra_cost->ldst.loadd;
11897
11898 *cost +=
11899 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11900 0, speed));
11901 }
11902
11903 return true;
11904
11905 case NEG:
11906 op0 = XEXP (x, 0);
11907
11908 if (VECTOR_MODE_P (mode))
11909 {
11910 if (speed)
11911 {
11912 /* FNEG. */
11913 *cost += extra_cost->vect.alu;
11914 }
11915 return false;
11916 }
11917
11918 if (GET_MODE_CLASS (mode) == MODE_INT)
11919 {
11920 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11921 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11922 {
11923 /* CSETM. */
11924 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11925 return true;
11926 }
11927
11928 /* Cost this as SUB wzr, X. */
11929 op0 = CONST0_RTX (mode);
11930 op1 = XEXP (x, 0);
11931 goto cost_minus;
11932 }
11933
11934 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11935 {
11936 /* Support (neg(fma...)) as a single instruction only if
11937 the sign of zeros is unimportant. This matches the decision
11938 making in aarch64.md. */
11939 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11940 {
11941 /* FNMADD. */
11942 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11943 return true;
11944 }
11945 if (GET_CODE (op0) == MULT)
11946 {
11947 /* FNMUL. */
11948 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11949 return true;
11950 }
11951 if (speed)
11952 /* FNEG. */
11953 *cost += extra_cost->fp[mode == DFmode].neg;
11954 return false;
11955 }
11956
11957 return false;
11958
11959 case CLRSB:
11960 case CLZ:
11961 if (speed)
11962 {
11963 if (VECTOR_MODE_P (mode))
11964 *cost += extra_cost->vect.alu;
11965 else
11966 *cost += extra_cost->alu.clz;
11967 }
11968
11969 return false;
11970
11971 case CTZ:
11972 *cost = COSTS_N_INSNS (2);
11973
11974 if (speed)
11975 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
11976 return false;
11977
11978 case COMPARE:
11979 op0 = XEXP (x, 0);
11980 op1 = XEXP (x, 1);
11981
11982 if (op1 == const0_rtx
11983 && GET_CODE (op0) == AND)
11984 {
11985 x = op0;
11986 mode = GET_MODE (op0);
11987 goto cost_logic;
11988 }
11989
11990 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11991 {
11992 /* TODO: A write to the CC flags possibly costs extra, this
11993 needs encoding in the cost tables. */
11994
11995 mode = GET_MODE (op0);
11996 /* ANDS. */
11997 if (GET_CODE (op0) == AND)
11998 {
11999 x = op0;
12000 goto cost_logic;
12001 }
12002
12003 if (GET_CODE (op0) == PLUS)
12004 {
12005 /* ADDS (and CMN alias). */
12006 x = op0;
12007 goto cost_plus;
12008 }
12009
12010 if (GET_CODE (op0) == MINUS)
12011 {
12012 /* SUBS. */
12013 x = op0;
12014 goto cost_minus;
12015 }
12016
12017 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12018 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12019 && CONST_INT_P (XEXP (op0, 2)))
12020 {
12021 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12022 Handle it here directly rather than going to cost_logic
12023 since we know the immediate generated for the TST is valid
12024 so we can avoid creating an intermediate rtx for it only
12025 for costing purposes. */
12026 if (speed)
12027 *cost += extra_cost->alu.logical;
12028
12029 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12030 ZERO_EXTRACT, 0, speed);
12031 return true;
12032 }
12033
12034 if (GET_CODE (op1) == NEG)
12035 {
12036 /* CMN. */
12037 if (speed)
12038 *cost += extra_cost->alu.arith;
12039
12040 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12041 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12042 return true;
12043 }
12044
12045 /* CMP.
12046
12047 Compare can freely swap the order of operands, and
12048 canonicalization puts the more complex operation first.
12049 But the integer MINUS logic expects the shift/extend
12050 operation in op1. */
12051 if (! (REG_P (op0)
12052 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12053 {
12054 op0 = XEXP (x, 1);
12055 op1 = XEXP (x, 0);
12056 }
12057 goto cost_minus;
12058 }
12059
12060 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12061 {
12062 /* FCMP. */
12063 if (speed)
12064 *cost += extra_cost->fp[mode == DFmode].compare;
12065
12066 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12067 {
12068 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12069 /* FCMP supports constant 0.0 for no extra cost. */
12070 return true;
12071 }
12072 return false;
12073 }
12074
12075 if (VECTOR_MODE_P (mode))
12076 {
12077 /* Vector compare. */
12078 if (speed)
12079 *cost += extra_cost->vect.alu;
12080
12081 if (aarch64_float_const_zero_rtx_p (op1))
12082 {
12083 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12084 cost. */
12085 return true;
12086 }
12087 return false;
12088 }
12089 return false;
12090
12091 case MINUS:
12092 {
12093 op0 = XEXP (x, 0);
12094 op1 = XEXP (x, 1);
12095
12096 cost_minus:
12097 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12098
12099 /* Detect valid immediates. */
12100 if ((GET_MODE_CLASS (mode) == MODE_INT
12101 || (GET_MODE_CLASS (mode) == MODE_CC
12102 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12103 && CONST_INT_P (op1)
12104 && aarch64_uimm12_shift (INTVAL (op1)))
12105 {
12106 if (speed)
12107 /* SUB(S) (immediate). */
12108 *cost += extra_cost->alu.arith;
12109 return true;
12110 }
12111
12112 /* Look for SUB (extended register). */
12113 if (is_a <scalar_int_mode> (mode, &int_mode)
12114 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
12115 {
12116 if (speed)
12117 *cost += extra_cost->alu.extend_arith;
12118
12119 op1 = aarch64_strip_extend (op1, true);
12120 *cost += rtx_cost (op1, VOIDmode,
12121 (enum rtx_code) GET_CODE (op1), 0, speed);
12122 return true;
12123 }
12124
12125 rtx new_op1 = aarch64_strip_extend (op1, false);
12126
12127 /* Cost this as an FMA-alike operation. */
12128 if ((GET_CODE (new_op1) == MULT
12129 || aarch64_shift_p (GET_CODE (new_op1)))
12130 && code != COMPARE)
12131 {
12132 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12133 (enum rtx_code) code,
12134 speed);
12135 return true;
12136 }
12137
12138 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12139
12140 if (speed)
12141 {
12142 if (VECTOR_MODE_P (mode))
12143 {
12144 /* Vector SUB. */
12145 *cost += extra_cost->vect.alu;
12146 }
12147 else if (GET_MODE_CLASS (mode) == MODE_INT)
12148 {
12149 /* SUB(S). */
12150 *cost += extra_cost->alu.arith;
12151 }
12152 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12153 {
12154 /* FSUB. */
12155 *cost += extra_cost->fp[mode == DFmode].addsub;
12156 }
12157 }
12158 return true;
12159 }
12160
12161 case PLUS:
12162 {
12163 rtx new_op0;
12164
12165 op0 = XEXP (x, 0);
12166 op1 = XEXP (x, 1);
12167
12168 cost_plus:
12169 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12170 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12171 {
12172 /* CSINC. */
12173 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12174 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12175 return true;
12176 }
12177
12178 if (GET_MODE_CLASS (mode) == MODE_INT
12179 && (aarch64_plus_immediate (op1, mode)
12180 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
12181 {
12182 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12183
12184 if (speed)
12185 /* ADD (immediate). */
12186 *cost += extra_cost->alu.arith;
12187 return true;
12188 }
12189
12190 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12191
12192 /* Look for ADD (extended register). */
12193 if (is_a <scalar_int_mode> (mode, &int_mode)
12194 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
12195 {
12196 if (speed)
12197 *cost += extra_cost->alu.extend_arith;
12198
12199 op0 = aarch64_strip_extend (op0, true);
12200 *cost += rtx_cost (op0, VOIDmode,
12201 (enum rtx_code) GET_CODE (op0), 0, speed);
12202 return true;
12203 }
12204
12205 /* Strip any extend but leave shifts behind, as we will
12206 cost them through mult_cost. */
12207 new_op0 = aarch64_strip_extend (op0, false);
12208
12209 if (GET_CODE (new_op0) == MULT
12210 || aarch64_shift_p (GET_CODE (new_op0)))
12211 {
12212 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12213 speed);
12214 return true;
12215 }
12216
12217 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12218
12219 if (speed)
12220 {
12221 if (VECTOR_MODE_P (mode))
12222 {
12223 /* Vector ADD. */
12224 *cost += extra_cost->vect.alu;
12225 }
12226 else if (GET_MODE_CLASS (mode) == MODE_INT)
12227 {
12228 /* ADD. */
12229 *cost += extra_cost->alu.arith;
12230 }
12231 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12232 {
12233 /* FADD. */
12234 *cost += extra_cost->fp[mode == DFmode].addsub;
12235 }
12236 }
12237 return true;
12238 }
12239
12240 case BSWAP:
12241 *cost = COSTS_N_INSNS (1);
12242
12243 if (speed)
12244 {
12245 if (VECTOR_MODE_P (mode))
12246 *cost += extra_cost->vect.alu;
12247 else
12248 *cost += extra_cost->alu.rev;
12249 }
12250 return false;
12251
12252 case IOR:
12253 if (aarch_rev16_p (x))
12254 {
12255 *cost = COSTS_N_INSNS (1);
12256
12257 if (speed)
12258 {
12259 if (VECTOR_MODE_P (mode))
12260 *cost += extra_cost->vect.alu;
12261 else
12262 *cost += extra_cost->alu.rev;
12263 }
12264 return true;
12265 }
12266
12267 if (aarch64_extr_rtx_p (x, &op0, &op1))
12268 {
12269 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12270 *cost += rtx_cost (op1, mode, IOR, 1, speed);
12271 if (speed)
12272 *cost += extra_cost->alu.shift;
12273
12274 return true;
12275 }
12276 /* Fall through. */
12277 case XOR:
12278 case AND:
12279 cost_logic:
12280 op0 = XEXP (x, 0);
12281 op1 = XEXP (x, 1);
12282
12283 if (VECTOR_MODE_P (mode))
12284 {
12285 if (speed)
12286 *cost += extra_cost->vect.alu;
12287 return true;
12288 }
12289
12290 if (code == AND
12291 && GET_CODE (op0) == MULT
12292 && CONST_INT_P (XEXP (op0, 1))
12293 && CONST_INT_P (op1)
12294 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12295 INTVAL (op1)) != 0)
12296 {
12297 /* This is a UBFM/SBFM. */
12298 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12299 if (speed)
12300 *cost += extra_cost->alu.bfx;
12301 return true;
12302 }
12303
12304 if (is_int_mode (mode, &int_mode))
12305 {
12306 if (CONST_INT_P (op1))
12307 {
12308 /* We have a mask + shift version of a UBFIZ
12309 i.e. the *andim_ashift<mode>_bfiz pattern. */
12310 if (GET_CODE (op0) == ASHIFT
12311 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12312 XEXP (op0, 1)))
12313 {
12314 *cost += rtx_cost (XEXP (op0, 0), int_mode,
12315 (enum rtx_code) code, 0, speed);
12316 if (speed)
12317 *cost += extra_cost->alu.bfx;
12318
12319 return true;
12320 }
12321 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12322 {
12323 /* We possibly get the immediate for free; this is not
12324 modelled. */
12325 *cost += rtx_cost (op0, int_mode,
12326 (enum rtx_code) code, 0, speed);
12327 if (speed)
12328 *cost += extra_cost->alu.logical;
12329
12330 return true;
12331 }
12332 }
12333 else
12334 {
12335 rtx new_op0 = op0;
12336
12337 /* Handle ORN, EON, or BIC. */
12338 if (GET_CODE (op0) == NOT)
12339 op0 = XEXP (op0, 0);
12340
12341 new_op0 = aarch64_strip_shift (op0);
12342
12343 /* If we had a shift on op0 then this is a logical-shift-
12344 by-register/immediate operation. Otherwise, this is just
12345 a logical operation. */
12346 if (speed)
12347 {
12348 if (new_op0 != op0)
12349 {
12350 /* Shift by immediate. */
12351 if (CONST_INT_P (XEXP (op0, 1)))
12352 *cost += extra_cost->alu.log_shift;
12353 else
12354 *cost += extra_cost->alu.log_shift_reg;
12355 }
12356 else
12357 *cost += extra_cost->alu.logical;
12358 }
12359
12360 /* In both cases we want to cost both operands. */
12361 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12362 0, speed);
12363 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12364 1, speed);
12365
12366 return true;
12367 }
12368 }
12369 return false;
12370
12371 case NOT:
12372 x = XEXP (x, 0);
12373 op0 = aarch64_strip_shift (x);
12374
12375 if (VECTOR_MODE_P (mode))
12376 {
12377 /* Vector NOT. */
12378 *cost += extra_cost->vect.alu;
12379 return false;
12380 }
12381
12382 /* MVN-shifted-reg. */
12383 if (op0 != x)
12384 {
12385 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12386
12387 if (speed)
12388 *cost += extra_cost->alu.log_shift;
12389
12390 return true;
12391 }
12392 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12393 Handle the second form here taking care that 'a' in the above can
12394 be a shift. */
12395 else if (GET_CODE (op0) == XOR)
12396 {
12397 rtx newop0 = XEXP (op0, 0);
12398 rtx newop1 = XEXP (op0, 1);
12399 rtx op0_stripped = aarch64_strip_shift (newop0);
12400
12401 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12402 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12403
12404 if (speed)
12405 {
12406 if (op0_stripped != newop0)
12407 *cost += extra_cost->alu.log_shift;
12408 else
12409 *cost += extra_cost->alu.logical;
12410 }
12411
12412 return true;
12413 }
12414 /* MVN. */
12415 if (speed)
12416 *cost += extra_cost->alu.logical;
12417
12418 return false;
12419
12420 case ZERO_EXTEND:
12421
12422 op0 = XEXP (x, 0);
12423 /* If a value is written in SI mode, then zero extended to DI
12424 mode, the operation will in general be free as a write to
12425 a 'w' register implicitly zeroes the upper bits of an 'x'
12426 register. However, if this is
12427
12428 (set (reg) (zero_extend (reg)))
12429
12430 we must cost the explicit register move. */
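      /* For instance, "ldr w0, [x1]" already zeroes bits 63:32 of x0, so no
	 separate extension instruction is needed (illustrative example).  */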
12431 if (mode == DImode
12432 && GET_MODE (op0) == SImode
12433 && outer == SET)
12434 {
12435 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12436
12437 /* If OP_COST is non-zero, then the cost of the zero extend
12438 is effectively the cost of the inner operation. Otherwise
12439 we have a MOV instruction and we take the cost from the MOV
12440 itself. This is true independently of whether we are
12441 optimizing for space or time. */
12442 if (op_cost)
12443 *cost = op_cost;
12444
12445 return true;
12446 }
12447 else if (MEM_P (op0))
12448 {
12449 /* All loads can zero extend to any size for free. */
12450 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12451 return true;
12452 }
12453
12454 op0 = aarch64_extend_bitfield_pattern_p (x);
12455 if (op0)
12456 {
12457 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12458 if (speed)
12459 *cost += extra_cost->alu.bfx;
12460 return true;
12461 }
12462
12463 if (speed)
12464 {
12465 if (VECTOR_MODE_P (mode))
12466 {
12467 /* UMOV. */
12468 *cost += extra_cost->vect.alu;
12469 }
12470 else
12471 {
12472 /* We generate an AND instead of UXTB/UXTH. */
12473 *cost += extra_cost->alu.logical;
12474 }
12475 }
12476 return false;
12477
12478 case SIGN_EXTEND:
12479 if (MEM_P (XEXP (x, 0)))
12480 {
12481 /* LDRSH. */
12482 if (speed)
12483 {
12484 rtx address = XEXP (XEXP (x, 0), 0);
12485 *cost += extra_cost->ldst.load_sign_extend;
12486
12487 *cost +=
12488 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12489 0, speed));
12490 }
12491 return true;
12492 }
12493
12494 op0 = aarch64_extend_bitfield_pattern_p (x);
12495 if (op0)
12496 {
12497 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12498 if (speed)
12499 *cost += extra_cost->alu.bfx;
12500 return true;
12501 }
12502
12503 if (speed)
12504 {
12505 if (VECTOR_MODE_P (mode))
12506 *cost += extra_cost->vect.alu;
12507 else
12508 *cost += extra_cost->alu.extend;
12509 }
12510 return false;
12511
12512 case ASHIFT:
12513 op0 = XEXP (x, 0);
12514 op1 = XEXP (x, 1);
12515
12516 if (CONST_INT_P (op1))
12517 {
12518 if (speed)
12519 {
12520 if (VECTOR_MODE_P (mode))
12521 {
12522 /* Vector shift (immediate). */
12523 *cost += extra_cost->vect.alu;
12524 }
12525 else
12526 {
12527 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12528 aliases. */
12529 *cost += extra_cost->alu.shift;
12530 }
12531 }
12532
12533 /* We can incorporate zero/sign extend for free. */
12534 if (GET_CODE (op0) == ZERO_EXTEND
12535 || GET_CODE (op0) == SIGN_EXTEND)
12536 op0 = XEXP (op0, 0);
12537
12538 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12539 return true;
12540 }
12541 else
12542 {
12543 if (VECTOR_MODE_P (mode))
12544 {
12545 if (speed)
12546 /* Vector shift (register). */
12547 *cost += extra_cost->vect.alu;
12548 }
12549 else
12550 {
12551 if (speed)
12552 /* LSLV. */
12553 *cost += extra_cost->alu.shift_reg;
12554
12555 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12556 && CONST_INT_P (XEXP (op1, 1))
12557 && known_eq (INTVAL (XEXP (op1, 1)),
12558 GET_MODE_BITSIZE (mode) - 1))
12559 {
12560 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12561 /* We already demanded XEXP (op1, 0) to be REG_P, so
12562 don't recurse into it. */
12563 return true;
12564 }
12565 }
12566 return false; /* All arguments need to be in registers. */
12567 }
12568
12569 case ROTATE:
12570 case ROTATERT:
12571 case LSHIFTRT:
12572 case ASHIFTRT:
12573 op0 = XEXP (x, 0);
12574 op1 = XEXP (x, 1);
12575
12576 if (CONST_INT_P (op1))
12577 {
12578 /* ASR (immediate) and friends. */
12579 if (speed)
12580 {
12581 if (VECTOR_MODE_P (mode))
12582 *cost += extra_cost->vect.alu;
12583 else
12584 *cost += extra_cost->alu.shift;
12585 }
12586
12587 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12588 return true;
12589 }
12590 else
12591 {
12592 if (VECTOR_MODE_P (mode))
12593 {
12594 if (speed)
12595 /* Vector shift (register). */
12596 *cost += extra_cost->vect.alu;
12597 }
12598 else
12599 {
12600 if (speed)
12601 /* ASR (register) and friends. */
12602 *cost += extra_cost->alu.shift_reg;
12603
12604 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12605 && CONST_INT_P (XEXP (op1, 1))
12606 && known_eq (INTVAL (XEXP (op1, 1)),
12607 GET_MODE_BITSIZE (mode) - 1))
12608 {
12609 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12610 /* We already demanded XEXP (op1, 0) to be REG_P, so
12611 don't recurse into it. */
12612 return true;
12613 }
12614 }
12615 return false; /* All arguments need to be in registers. */
12616 }
12617
12618 case SYMBOL_REF:
12619
12620 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12621 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12622 {
12623 /* LDR. */
12624 if (speed)
12625 *cost += extra_cost->ldst.load;
12626 }
12627 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12628 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12629 {
12630 /* ADRP, followed by ADD. */
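	  /* e.g. "adrp x0, sym" followed by "add x0, x0, :lo12:sym"
	     (illustrative register choice).  */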
12631 *cost += COSTS_N_INSNS (1);
12632 if (speed)
12633 *cost += 2 * extra_cost->alu.arith;
12634 }
12635 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12636 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12637 {
12638 /* ADR. */
12639 if (speed)
12640 *cost += extra_cost->alu.arith;
12641 }
12642
12643 if (flag_pic)
12644 {
12645 /* One extra load instruction, after accessing the GOT. */
12646 *cost += COSTS_N_INSNS (1);
12647 if (speed)
12648 *cost += extra_cost->ldst.load;
12649 }
12650 return true;
12651
12652 case HIGH:
12653 case LO_SUM:
12654 /* ADRP/ADD (immediate). */
12655 if (speed)
12656 *cost += extra_cost->alu.arith;
12657 return true;
12658
12659 case ZERO_EXTRACT:
12660 case SIGN_EXTRACT:
12661 /* UBFX/SBFX. */
12662 if (speed)
12663 {
12664 if (VECTOR_MODE_P (mode))
12665 *cost += extra_cost->vect.alu;
12666 else
12667 *cost += extra_cost->alu.bfx;
12668 }
12669
12670 /* We can trust that the immediates used will be correct (there
12671 are no by-register forms), so we need only cost op0. */
12672 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12673 return true;
12674
12675 case MULT:
12676 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12677 /* aarch64_rtx_mult_cost always handles recursion to its
12678 operands. */
12679 return true;
12680
12681 case MOD:
12682 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12683 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
12684 that of an unconditional negate. This case should only ever be reached through
12685 the set_smod_pow2_cheap check in expmed.c. */
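      /* Illustrative sketch only (SImode, modulus 4); the actual expansion
	 is produced by expmed.c:

	     negs  w1, w0
	     and   w0, w0, #3
	     and   w1, w1, #3
	     csneg w0, w0, w1, mi  */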
12686 if (CONST_INT_P (XEXP (x, 1))
12687 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12688 && (mode == SImode || mode == DImode))
12689 {
12690 /* We expand to 4 instructions. Reset the baseline. */
12691 *cost = COSTS_N_INSNS (4);
12692
12693 if (speed)
12694 *cost += 2 * extra_cost->alu.logical
12695 + 2 * extra_cost->alu.arith;
12696
12697 return true;
12698 }
12699
12700 /* Fall-through. */
12701 case UMOD:
12702 if (speed)
12703 {
12704 /* Slightly prefer UMOD over SMOD. */
12705 if (VECTOR_MODE_P (mode))
12706 *cost += extra_cost->vect.alu;
12707 else if (GET_MODE_CLASS (mode) == MODE_INT)
12708 *cost += (extra_cost->mult[mode == DImode].add
12709 + extra_cost->mult[mode == DImode].idiv
12710 + (code == MOD ? 1 : 0));
12711 }
12712 return false; /* All arguments need to be in registers. */
12713
12714 case DIV:
12715 case UDIV:
12716 case SQRT:
12717 if (speed)
12718 {
12719 if (VECTOR_MODE_P (mode))
12720 *cost += extra_cost->vect.alu;
12721 else if (GET_MODE_CLASS (mode) == MODE_INT)
12722 /* There is no integer SQRT, so only DIV and UDIV can get
12723 here. */
12724 *cost += (extra_cost->mult[mode == DImode].idiv
12725 /* Slightly prefer UDIV over SDIV. */
12726 + (code == DIV ? 1 : 0));
12727 else
12728 *cost += extra_cost->fp[mode == DFmode].div;
12729 }
12730 return false; /* All arguments need to be in registers. */
12731
12732 case IF_THEN_ELSE:
12733 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12734 XEXP (x, 2), cost, speed);
12735
12736 case EQ:
12737 case NE:
12738 case GT:
12739 case GTU:
12740 case LT:
12741 case LTU:
12742 case GE:
12743 case GEU:
12744 case LE:
12745 case LEU:
12746
12747 return false; /* All arguments must be in registers. */
12748
12749 case FMA:
12750 op0 = XEXP (x, 0);
12751 op1 = XEXP (x, 1);
12752 op2 = XEXP (x, 2);
12753
12754 if (speed)
12755 {
12756 if (VECTOR_MODE_P (mode))
12757 *cost += extra_cost->vect.alu;
12758 else
12759 *cost += extra_cost->fp[mode == DFmode].fma;
12760 }
12761
12762 /* FMSUB, FNMADD, and FNMSUB are free. */
12763 if (GET_CODE (op0) == NEG)
12764 op0 = XEXP (op0, 0);
12765
12766 if (GET_CODE (op2) == NEG)
12767 op2 = XEXP (op2, 0);
12768
12769 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12770 and the by-element operand as operand 0. */
12771 if (GET_CODE (op1) == NEG)
12772 op1 = XEXP (op1, 0);
12773
12774 /* Catch vector-by-element operations. The by-element operand can
12775 either be (vec_duplicate (vec_select (x))) or just
12776 (vec_select (x)), depending on whether we are multiplying by
12777 a vector or a scalar.
12778
12779 Canonicalization is not very good in these cases: FMA4 will put the
12780 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
12781 if (GET_CODE (op0) == VEC_DUPLICATE)
12782 op0 = XEXP (op0, 0);
12783 else if (GET_CODE (op1) == VEC_DUPLICATE)
12784 op1 = XEXP (op1, 0);
12785
12786 if (GET_CODE (op0) == VEC_SELECT)
12787 op0 = XEXP (op0, 0);
12788 else if (GET_CODE (op1) == VEC_SELECT)
12789 op1 = XEXP (op1, 0);
12790
12791 /* If the remaining parameters are not registers,
12792 get the cost to put them into registers. */
12793 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12794 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12795 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12796 return true;
12797
12798 case FLOAT:
12799 case UNSIGNED_FLOAT:
12800 if (speed)
12801 *cost += extra_cost->fp[mode == DFmode].fromint;
12802 return false;
12803
12804 case FLOAT_EXTEND:
12805 if (speed)
12806 {
12807 if (VECTOR_MODE_P (mode))
12808 {
12809 /* Vector widening conversion. */
12810 *cost += extra_cost->vect.alu;
12811 }
12812 else
12813 *cost += extra_cost->fp[mode == DFmode].widen;
12814 }
12815 return false;
12816
12817 case FLOAT_TRUNCATE:
12818 if (speed)
12819 {
12820 if (VECTOR_MODE_P (mode))
12821 {
12822 /* Vector narrowing conversion. */
12823 *cost += extra_cost->vect.alu;
12824 }
12825 else
12826 *cost += extra_cost->fp[mode == DFmode].narrow;
12827 }
12828 return false;
12829
12830 case FIX:
12831 case UNSIGNED_FIX:
12832 x = XEXP (x, 0);
12833 /* Strip the rounding part; all of these rounding operations will be
12834 implemented by the fcvt* family of instructions anyway. */
12835 if (GET_CODE (x) == UNSPEC)
12836 {
12837 unsigned int uns_code = XINT (x, 1);
12838
12839 if (uns_code == UNSPEC_FRINTA
12840 || uns_code == UNSPEC_FRINTM
12841 || uns_code == UNSPEC_FRINTN
12842 || uns_code == UNSPEC_FRINTP
12843 || uns_code == UNSPEC_FRINTZ)
12844 x = XVECEXP (x, 0, 0);
12845 }
12846
12847 if (speed)
12848 {
12849 if (VECTOR_MODE_P (mode))
12850 *cost += extra_cost->vect.alu;
12851 else
12852 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12853 }
12854
12855 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12856 fixed-point fcvt. */
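      /* For instance, (int) (f * 4.0f) can map to a single
	 "fcvtzs w0, s0, #2", since two fractional bits correspond to a
	 multiply by 4 (illustrative registers).  */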
12857 if (GET_CODE (x) == MULT
12858 && ((VECTOR_MODE_P (mode)
12859 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12860 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12861 {
12862 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12863 0, speed);
12864 return true;
12865 }
12866
12867 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12868 return true;
12869
12870 case ABS:
12871 if (VECTOR_MODE_P (mode))
12872 {
12873 /* ABS (vector). */
12874 if (speed)
12875 *cost += extra_cost->vect.alu;
12876 }
12877 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12878 {
12879 op0 = XEXP (x, 0);
12880
12881 /* FABD, which is analogous to FADD. */
12882 if (GET_CODE (op0) == MINUS)
12883 {
12884 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12885 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12886 if (speed)
12887 *cost += extra_cost->fp[mode == DFmode].addsub;
12888
12889 return true;
12890 }
12891 /* Simple FABS is analogous to FNEG. */
12892 if (speed)
12893 *cost += extra_cost->fp[mode == DFmode].neg;
12894 }
12895 else
12896 {
12897 /* Integer ABS will either be split into
12898 two arithmetic instructions, or will be an ABS
12899 (scalar), which we don't model. */
12900 *cost = COSTS_N_INSNS (2);
12901 if (speed)
12902 *cost += 2 * extra_cost->alu.arith;
12903 }
12904 return false;
12905
12906 case SMAX:
12907 case SMIN:
12908 if (speed)
12909 {
12910 if (VECTOR_MODE_P (mode))
12911 *cost += extra_cost->vect.alu;
12912 else
12913 {
12914 /* FMAXNM/FMINNM/FMAX/FMIN.
12915 TODO: This may not be accurate for all implementations, but
12916 we do not model this in the cost tables. */
12917 *cost += extra_cost->fp[mode == DFmode].addsub;
12918 }
12919 }
12920 return false;
12921
12922 case UNSPEC:
12923 /* The floating point round to integer frint* instructions. */
12924 if (aarch64_frint_unspec_p (XINT (x, 1)))
12925 {
12926 if (speed)
12927 *cost += extra_cost->fp[mode == DFmode].roundint;
12928
12929 return false;
12930 }
12931
12932 if (XINT (x, 1) == UNSPEC_RBIT)
12933 {
12934 if (speed)
12935 *cost += extra_cost->alu.rev;
12936
12937 return false;
12938 }
12939 break;
12940
12941 case TRUNCATE:
12942
12943 /* Decompose <su>muldi3_highpart. */
12944 if (/* (truncate:DI */
12945 mode == DImode
12946 /* (lshiftrt:TI */
12947 && GET_MODE (XEXP (x, 0)) == TImode
12948 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12949 /* (mult:TI */
12950 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12951 /* (ANY_EXTEND:TI (reg:DI))
12952 (ANY_EXTEND:TI (reg:DI))) */
12953 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12954 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12955 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12956 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12957 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12958 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12959 /* (const_int 64) */
12960 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12961 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12962 {
12963 /* UMULH/SMULH. */
12964 if (speed)
12965 *cost += extra_cost->mult[mode == DImode].extend;
12966 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12967 mode, MULT, 0, speed);
12968 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12969 mode, MULT, 1, speed);
12970 return true;
12971 }
12972
12973 /* Fall through. */
12974 default:
12975 break;
12976 }
12977
12978 if (dump_file
12979 && flag_aarch64_verbose_cost)
12980 fprintf (dump_file,
12981 "\nFailed to cost RTX. Assuming default cost.\n");
12982
12983 return true;
12984 }
12985
12986 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12987 calculated for X. This cost is stored in *COST. Returns true
12988 if the total cost of X was calculated. */
12989 static bool
12990 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12991 int param, int *cost, bool speed)
12992 {
12993 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12994
12995 if (dump_file
12996 && flag_aarch64_verbose_cost)
12997 {
12998 print_rtl_single (dump_file, x);
12999 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13000 speed ? "Hot" : "Cold",
13001 *cost, result ? "final" : "partial");
13002 }
13003
13004 return result;
13005 }
13006
13007 static int
13008 aarch64_register_move_cost (machine_mode mode,
13009 reg_class_t from_i, reg_class_t to_i)
13010 {
13011 enum reg_class from = (enum reg_class) from_i;
13012 enum reg_class to = (enum reg_class) to_i;
13013 const struct cpu_regmove_cost *regmove_cost
13014 = aarch64_tune_params.regmove_cost;
13015
13016 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13017 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
13018 to = GENERAL_REGS;
13019
13020 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
13021 from = GENERAL_REGS;
13022
13023 /* Make RDFFR very expensive. In particular, if we know that the FFR
13024 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13025 as a way of obtaining a PTRUE. */
13026 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13027 && hard_reg_set_subset_p (reg_class_contents[from_i],
13028 reg_class_contents[FFR_REGS]))
13029 return 80;
13030
13031 /* Moving between GPR and stack cost is the same as GP2GP. */
13032 if ((from == GENERAL_REGS && to == STACK_REG)
13033 || (to == GENERAL_REGS && from == STACK_REG))
13034 return regmove_cost->GP2GP;
13035
13036 /* To/From the stack register, we move via the gprs. */
13037 if (to == STACK_REG || from == STACK_REG)
13038 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13039 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13040
13041 if (known_eq (GET_MODE_SIZE (mode), 16))
13042 {
13043 /* 128-bit operations on general registers require 2 instructions. */
13044 if (from == GENERAL_REGS && to == GENERAL_REGS)
13045 return regmove_cost->GP2GP * 2;
13046 else if (from == GENERAL_REGS)
13047 return regmove_cost->GP2FP * 2;
13048 else if (to == GENERAL_REGS)
13049 return regmove_cost->FP2GP * 2;
13050
13051 /* When AdvSIMD instructions are disabled it is not possible to move
13052 a 128-bit value directly between Q registers. This is handled in
13053 secondary reload. A general register is used as a scratch to move
13054 the upper DI value and the lower DI value is moved directly,
13055 hence the cost is the sum of three moves. */
13056 if (! TARGET_SIMD)
13057 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13058
13059 return regmove_cost->FP2FP;
13060 }
13061
13062 if (from == GENERAL_REGS && to == GENERAL_REGS)
13063 return regmove_cost->GP2GP;
13064 else if (from == GENERAL_REGS)
13065 return regmove_cost->GP2FP;
13066 else if (to == GENERAL_REGS)
13067 return regmove_cost->FP2GP;
13068
13069 return regmove_cost->FP2FP;
13070 }
13071
13072 static int
13073 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13074 reg_class_t rclass ATTRIBUTE_UNUSED,
13075 bool in ATTRIBUTE_UNUSED)
13076 {
13077 return aarch64_tune_params.memmov_cost;
13078 }
13079
13080 /* Implement TARGET_INIT_BUILTINS. */
13081 static void
13082 aarch64_init_builtins ()
13083 {
13084 aarch64_general_init_builtins ();
13085 aarch64_sve::init_builtins ();
13086 }
13087
13088 /* Implement TARGET_FOLD_BUILTIN. */
13089 static tree
13090 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13091 {
13092 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13093 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13094 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13095 switch (code & AARCH64_BUILTIN_CLASS)
13096 {
13097 case AARCH64_BUILTIN_GENERAL:
13098 return aarch64_general_fold_builtin (subcode, type, nargs, args);
13099
13100 case AARCH64_BUILTIN_SVE:
13101 return NULL_TREE;
13102 }
13103 gcc_unreachable ();
13104 }
13105
13106 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13107 static bool
13108 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13109 {
13110 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13111 tree fndecl = gimple_call_fndecl (stmt);
13112 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13113 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13114 gimple *new_stmt = NULL;
13115 switch (code & AARCH64_BUILTIN_CLASS)
13116 {
13117 case AARCH64_BUILTIN_GENERAL:
13118 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13119 break;
13120
13121 case AARCH64_BUILTIN_SVE:
13122 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13123 break;
13124 }
13125
13126 if (!new_stmt)
13127 return false;
13128
13129 gsi_replace (gsi, new_stmt, true);
13130 return true;
13131 }
13132
13133 /* Implement TARGET_EXPAND_BUILTIN. */
13134 static rtx
13135 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13136 {
13137 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13138 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13139 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13140 switch (code & AARCH64_BUILTIN_CLASS)
13141 {
13142 case AARCH64_BUILTIN_GENERAL:
13143 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13144
13145 case AARCH64_BUILTIN_SVE:
13146 return aarch64_sve::expand_builtin (subcode, exp, target);
13147 }
13148 gcc_unreachable ();
13149 }
13150
13151 /* Implement TARGET_BUILTIN_DECL. */
13152 static tree
13153 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13154 {
13155 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13156 switch (code & AARCH64_BUILTIN_CLASS)
13157 {
13158 case AARCH64_BUILTIN_GENERAL:
13159 return aarch64_general_builtin_decl (subcode, initialize_p);
13160
13161 case AARCH64_BUILTIN_SVE:
13162 return aarch64_sve::builtin_decl (subcode, initialize_p);
13163 }
13164 gcc_unreachable ();
13165 }
13166
13167 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13168 to optimize 1.0/sqrt. */
13169
13170 static bool
13171 use_rsqrt_p (machine_mode mode)
13172 {
13173 return (!flag_trapping_math
13174 && flag_unsafe_math_optimizations
13175 && ((aarch64_tune_params.approx_modes->recip_sqrt
13176 & AARCH64_APPROX_MODE (mode))
13177 || flag_mrecip_low_precision_sqrt));
13178 }
13179
13180 /* Function to decide when to use the approximate reciprocal square root
13181 builtin. */
13182
13183 static tree
13184 aarch64_builtin_reciprocal (tree fndecl)
13185 {
13186 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13187
13188 if (!use_rsqrt_p (mode))
13189 return NULL_TREE;
13190 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13191 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13192 switch (code & AARCH64_BUILTIN_CLASS)
13193 {
13194 case AARCH64_BUILTIN_GENERAL:
13195 return aarch64_general_builtin_rsqrt (subcode);
13196
13197 case AARCH64_BUILTIN_SVE:
13198 return NULL_TREE;
13199 }
13200 gcc_unreachable ();
13201 }
13202
13203 /* Emit code to perform the floating-point operation:
13204
13205 DST = SRC1 * SRC2
13206
13207 where all three operands are already known to be registers.
13208 If the operation is an SVE one, PTRUE is a suitable all-true
13209 predicate. */
13210
13211 static void
13212 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13213 {
13214 if (ptrue)
13215 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13216 dst, ptrue, src1, src2,
13217 gen_int_mode (SVE_RELAXED_GP, SImode)));
13218 else
13219 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13220 }
13221
13222 /* Emit instruction sequence to compute either the approximate square root
13223 or its approximate reciprocal, depending on the flag RECP, and return
13224 whether the sequence was emitted or not. */
13225
13226 bool
13227 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13228 {
13229 machine_mode mode = GET_MODE (dst);
13230
13231 if (GET_MODE_INNER (mode) == HFmode)
13232 {
13233 gcc_assert (!recp);
13234 return false;
13235 }
13236
13237 if (!recp)
13238 {
13239 if (!(flag_mlow_precision_sqrt
13240 || (aarch64_tune_params.approx_modes->sqrt
13241 & AARCH64_APPROX_MODE (mode))))
13242 return false;
13243
13244 if (!flag_finite_math_only
13245 || flag_trapping_math
13246 || !flag_unsafe_math_optimizations
13247 || optimize_function_for_size_p (cfun))
13248 return false;
13249 }
13250 else
13251 /* Caller assumes we cannot fail. */
13252 gcc_assert (use_rsqrt_p (mode));
13253
13254 rtx pg = NULL_RTX;
13255 if (aarch64_sve_mode_p (mode))
13256 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13257 machine_mode mmsk = (VECTOR_MODE_P (mode)
13258 ? related_int_vector_mode (mode).require ()
13259 : int_mode_for_mode (mode).require ());
13260 rtx xmsk = NULL_RTX;
13261 if (!recp)
13262 {
13263 /* When calculating the approximate square root, compare the
13264 argument with 0.0 and create a mask. */
13265 rtx zero = CONST0_RTX (mode);
13266 if (pg)
13267 {
13268 xmsk = gen_reg_rtx (GET_MODE (pg));
13269 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13270 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13271 xmsk, pg, hint, src, zero));
13272 }
13273 else
13274 {
13275 xmsk = gen_reg_rtx (mmsk);
13276 emit_insn (gen_rtx_SET (xmsk,
13277 gen_rtx_NEG (mmsk,
13278 gen_rtx_EQ (mmsk, src, zero))));
13279 }
13280 }
13281
13282 /* Estimate the approximate reciprocal square root. */
13283 rtx xdst = gen_reg_rtx (mode);
13284 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13285
13286 /* Iterate over the series twice for SF and thrice for DF. */
13287 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
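  /* Reading aid: each FRSQRTS step below implements the Newton-Raphson
     refinement x(n+1) = x(n) * (3 - d * x(n)^2) / 2 for 1/sqrt(d).  */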
13288
13289 /* Optionally iterate over the series once less for faster performance
13290 while sacrificing some accuracy. */
13291 if ((recp && flag_mrecip_low_precision_sqrt)
13292 || (!recp && flag_mlow_precision_sqrt))
13293 iterations--;
13294
13295 /* Iterate over the series to calculate the approximate reciprocal square
13296 root. */
13297 rtx x1 = gen_reg_rtx (mode);
13298 while (iterations--)
13299 {
13300 rtx x2 = gen_reg_rtx (mode);
13301 aarch64_emit_mult (x2, pg, xdst, xdst);
13302
13303 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13304
13305 if (iterations > 0)
13306 aarch64_emit_mult (xdst, pg, xdst, x1);
13307 }
13308
13309 if (!recp)
13310 {
13311 if (pg)
13312 /* Multiply nonzero source values by the corresponding intermediate
13313 result elements, so that the final calculation is the approximate
13314 square root rather than its reciprocal. Select a zero result for
13315 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13316 otherwise. */
13317 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13318 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13319 else
13320 {
13321 /* Qualify the approximate reciprocal square root when the
13322 argument is 0.0 by squashing the intermediary result to 0.0. */
13323 rtx xtmp = gen_reg_rtx (mmsk);
13324 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13325 gen_rtx_SUBREG (mmsk, xdst, 0)));
13326 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13327
13328 /* Calculate the approximate square root. */
13329 aarch64_emit_mult (xdst, pg, xdst, src);
13330 }
13331 }
13332
13333 /* Finalize the approximation. */
13334 aarch64_emit_mult (dst, pg, xdst, x1);
13335
13336 return true;
13337 }
13338
13339 /* Emit the instruction sequence to compute the approximation for the division
13340 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
13341
13342 bool
13343 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13344 {
13345 machine_mode mode = GET_MODE (quo);
13346
13347 if (GET_MODE_INNER (mode) == HFmode)
13348 return false;
13349
13350 bool use_approx_division_p = (flag_mlow_precision_div
13351 || (aarch64_tune_params.approx_modes->division
13352 & AARCH64_APPROX_MODE (mode)));
13353
13354 if (!flag_finite_math_only
13355 || flag_trapping_math
13356 || !flag_unsafe_math_optimizations
13357 || optimize_function_for_size_p (cfun)
13358 || !use_approx_division_p)
13359 return false;
13360
13361 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13362 return false;
13363
13364 rtx pg = NULL_RTX;
13365 if (aarch64_sve_mode_p (mode))
13366 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13367
13368 /* Estimate the approximate reciprocal. */
13369 rtx xrcp = gen_reg_rtx (mode);
13370 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13371
13372 /* Iterate over the series twice for SF and thrice for DF. */
13373 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
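  /* Reading aid: each FRECPS step below implements the Newton-Raphson
     refinement x(n+1) = x(n) * (2 - d * x(n)) for 1/d.  */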
13374
13375 /* Optionally iterate over the series less for faster performance,
13376 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
13377 if (flag_mlow_precision_div)
13378 iterations = (GET_MODE_INNER (mode) == DFmode
13379 ? aarch64_double_recp_precision
13380 : aarch64_float_recp_precision);
13381
13382 /* Iterate over the series to calculate the approximate reciprocal. */
13383 rtx xtmp = gen_reg_rtx (mode);
13384 while (iterations--)
13385 {
13386 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13387
13388 if (iterations > 0)
13389 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13390 }
13391
13392 if (num != CONST1_RTX (mode))
13393 {
13394 /* As the approximate reciprocal of DEN is already calculated, only
13395 calculate the approximate division when NUM is not 1.0. */
13396 rtx xnum = force_reg (mode, num);
13397 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13398 }
13399
13400 /* Finalize the approximation. */
13401 aarch64_emit_mult (quo, pg, xrcp, xtmp);
13402 return true;
13403 }
13404
13405 /* Return the number of instructions that can be issued per cycle. */
13406 static int
13407 aarch64_sched_issue_rate (void)
13408 {
13409 return aarch64_tune_params.issue_rate;
13410 }
13411
13412 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13413 static int
13414 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13415 {
13416 if (DEBUG_INSN_P (insn))
13417 return more;
13418
13419 rtx_code code = GET_CODE (PATTERN (insn));
13420 if (code == USE || code == CLOBBER)
13421 return more;
13422
13423 if (get_attr_type (insn) == TYPE_NO_INSN)
13424 return more;
13425
13426 return more - 1;
13427 }
13428
13429 static int
13430 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13431 {
13432 int issue_rate = aarch64_sched_issue_rate ();
13433
13434 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13435 }
13436
13437
13438 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13439 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13440 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13441
13442 static int
13443 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13444 int ready_index)
13445 {
13446 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13447 }
13448
13449
13450 /* Vectorizer cost model target hooks. */
13451
13452 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13453 static int
13454 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13455 tree vectype,
13456 int misalign ATTRIBUTE_UNUSED)
13457 {
13458 unsigned elements;
13459 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13460 bool fp = false;
13461
13462 if (vectype != NULL)
13463 fp = FLOAT_TYPE_P (vectype);
13464
13465 switch (type_of_cost)
13466 {
13467 case scalar_stmt:
13468 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13469
13470 case scalar_load:
13471 return costs->scalar_load_cost;
13472
13473 case scalar_store:
13474 return costs->scalar_store_cost;
13475
13476 case vector_stmt:
13477 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13478
13479 case vector_load:
13480 return costs->vec_align_load_cost;
13481
13482 case vector_store:
13483 return costs->vec_store_cost;
13484
13485 case vec_to_scalar:
13486 return costs->vec_to_scalar_cost;
13487
13488 case scalar_to_vec:
13489 return costs->scalar_to_vec_cost;
13490
13491 case unaligned_load:
13492 case vector_gather_load:
13493 return costs->vec_unalign_load_cost;
13494
13495 case unaligned_store:
13496 case vector_scatter_store:
13497 return costs->vec_unalign_store_cost;
13498
13499 case cond_branch_taken:
13500 return costs->cond_taken_branch_cost;
13501
13502 case cond_branch_not_taken:
13503 return costs->cond_not_taken_branch_cost;
13504
13505 case vec_perm:
13506 return costs->vec_permute_cost;
13507
13508 case vec_promote_demote:
13509 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13510
13511 case vec_construct:
13512 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
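      /* E.g. an estimated 4-element construct is costed as 4 / 2 + 1 = 3.  */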
13513 return elements / 2 + 1;
13514
13515 default:
13516 gcc_unreachable ();
13517 }
13518 }
13519
13520 /* Return true if STMT_INFO extends the result of a load. */
13521 static bool
13522 aarch64_extending_load_p (stmt_vec_info stmt_info)
13523 {
13524 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13525 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13526 return false;
13527
13528 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13529 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13530 tree rhs_type = TREE_TYPE (rhs);
13531 if (!INTEGRAL_TYPE_P (lhs_type)
13532 || !INTEGRAL_TYPE_P (rhs_type)
13533 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13534 return false;
13535
13536 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
13537 return (def_stmt_info
13538 && STMT_VINFO_DATA_REF (def_stmt_info)
13539 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13540 }
13541
13542 /* Return true if STMT_INFO is an integer truncation. */
13543 static bool
13544 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13545 {
13546 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13547 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13548 return false;
13549
13550 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13551 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13552 return (INTEGRAL_TYPE_P (lhs_type)
13553 && INTEGRAL_TYPE_P (rhs_type)
13554 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13555 }
13556
13557 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13558 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13559 for SVE targets. */
13560 static unsigned int
13561 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
13562 unsigned int stmt_cost)
13563 {
13564 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13565 vector register size or number of units. Integer promotions of this
13566 type therefore map to SXT[BHW] or UXT[BHW].
13567
13568 Most loads have extending forms that can do the sign or zero extension
13569 on the fly. Optimistically assume that a load followed by an extension
13570 will fold to this form during combine, and that the extension therefore
13571 comes for free. */
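  /* E.g. a load feeding a sign extension is expected to combine into an
     extending load such as SVE's LD1SB or LD1SH (illustrative examples).  */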
13572 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13573 stmt_cost = 0;
13574
13575 /* For similar reasons, vector_stmt integer truncations are a no-op,
13576 because we can just ignore the unused upper bits of the source. */
13577 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13578 stmt_cost = 0;
13579
13580 return stmt_cost;
13581 }
13582
13583 /* Implement targetm.vectorize.add_stmt_cost. */
13584 static unsigned
13585 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13586 struct _stmt_vec_info *stmt_info, int misalign,
13587 enum vect_cost_model_location where)
13588 {
13589 unsigned *cost = (unsigned *) data;
13590 unsigned retval = 0;
13591
13592 if (flag_vect_cost_model)
13593 {
13594 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13595 int stmt_cost =
13596 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13597
13598 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13599 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13600
13601 /* Statements in an inner loop relative to the loop being
13602 vectorized are weighted more heavily. The value here is
13603 arbitrary and could potentially be improved with analysis. */
13604 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13605 count *= 50; /* FIXME */
13606
13607 retval = (unsigned) (count * stmt_cost);
13608 cost[where] += retval;
13609 }
13610
13611 return retval;
13612 }
13613
13614 static void initialize_aarch64_code_model (struct gcc_options *);
13615
13616 /* Parse the TO_PARSE string and put the architecture struct that it
13617 selects into RES and the architectural features into ISA_FLAGS.
13618 Return an aarch64_parse_opt_result describing the parse result.
13619 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13620 When the TO_PARSE string contains an invalid extension,
13621 a copy of the string is created and stored to INVALID_EXTENSION. */
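/* For example, "armv8.2-a+sve" (an illustrative value) matches the
   "armv8.2-a" architecture entry and hands "+sve" to
   aarch64_parse_extension.  */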
13622
13623 static enum aarch64_parse_opt_result
13624 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13625 uint64_t *isa_flags, std::string *invalid_extension)
13626 {
13627 const char *ext;
13628 const struct processor *arch;
13629 size_t len;
13630
13631 ext = strchr (to_parse, '+');
13632
13633 if (ext != NULL)
13634 len = ext - to_parse;
13635 else
13636 len = strlen (to_parse);
13637
13638 if (len == 0)
13639 return AARCH64_PARSE_MISSING_ARG;
13640
13641
13642 /* Loop through the list of supported ARCHes to find a match. */
13643 for (arch = all_architectures; arch->name != NULL; arch++)
13644 {
13645 if (strlen (arch->name) == len
13646 && strncmp (arch->name, to_parse, len) == 0)
13647 {
13648 uint64_t isa_temp = arch->flags;
13649
13650 if (ext != NULL)
13651 {
13652 /* TO_PARSE string contains at least one extension. */
13653 enum aarch64_parse_opt_result ext_res
13654 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13655
13656 if (ext_res != AARCH64_PARSE_OK)
13657 return ext_res;
13658 }
13659 /* Extension parsing was successful. Confirm the result
13660 arch and ISA flags. */
13661 *res = arch;
13662 *isa_flags = isa_temp;
13663 return AARCH64_PARSE_OK;
13664 }
13665 }
13666
13667 /* ARCH name not found in list. */
13668 return AARCH64_PARSE_INVALID_ARG;
13669 }
13670
13671 /* Parse the TO_PARSE string and put the result tuning in RES and the
13672 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13673 describing the parse result. If there is an error parsing, RES and
13674 ISA_FLAGS are left unchanged.
13675 When the TO_PARSE string contains an invalid extension,
13676 a copy of the string is created and stored to INVALID_EXTENSION. */
13677
13678 static enum aarch64_parse_opt_result
13679 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13680 uint64_t *isa_flags, std::string *invalid_extension)
13681 {
13682 const char *ext;
13683 const struct processor *cpu;
13684 size_t len;
13685
13686 ext = strchr (to_parse, '+');
13687
13688 if (ext != NULL)
13689 len = ext - to_parse;
13690 else
13691 len = strlen (to_parse);
13692
13693 if (len == 0)
13694 return AARCH64_PARSE_MISSING_ARG;
13695
13696
13697 /* Loop through the list of supported CPUs to find a match. */
13698 for (cpu = all_cores; cpu->name != NULL; cpu++)
13699 {
13700 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13701 {
13702 uint64_t isa_temp = cpu->flags;
13703
13704
13705 if (ext != NULL)
13706 {
13707 /* TO_PARSE string contains at least one extension. */
13708 enum aarch64_parse_opt_result ext_res
13709 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13710
13711 if (ext_res != AARCH64_PARSE_OK)
13712 return ext_res;
13713 }
13714 /* Extension parsing was successful. Confirm the result
13715 cpu and ISA flags. */
13716 *res = cpu;
13717 *isa_flags = isa_temp;
13718 return AARCH64_PARSE_OK;
13719 }
13720 }
13721
13722 /* CPU name not found in list. */
13723 return AARCH64_PARSE_INVALID_ARG;
13724 }
13725
13726 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13727 Return an aarch64_parse_opt_result describing the parse result.
13728 If the parsing fails the RES does not change. */
13729
13730 static enum aarch64_parse_opt_result
13731 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13732 {
13733 const struct processor *cpu;
13734
13735 /* Loop through the list of supported CPUs to find a match. */
13736 for (cpu = all_cores; cpu->name != NULL; cpu++)
13737 {
13738 if (strcmp (cpu->name, to_parse) == 0)
13739 {
13740 *res = cpu;
13741 return AARCH64_PARSE_OK;
13742 }
13743 }
13744
13745 /* CPU name not found in list. */
13746 return AARCH64_PARSE_INVALID_ARG;
13747 }
13748
13749 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13750 described in FLAG. If it is, return the index bit for that fusion type.
13751 If not, error (printing OPTION_NAME) and return zero. */
13752
13753 static unsigned int
13754 aarch64_parse_one_option_token (const char *token,
13755 size_t length,
13756 const struct aarch64_flag_desc *flag,
13757 const char *option_name)
13758 {
13759 for (; flag->name != NULL; flag++)
13760 {
13761 if (length == strlen (flag->name)
13762 && !strncmp (flag->name, token, length))
13763 return flag->flag;
13764 }
13765
13766 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13767 return 0;
13768 }
13769
13770 /* Parse OPTION which is a comma-separated list of flags to enable.
13771 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13772 default state we inherit from the CPU tuning structures. OPTION_NAME
13773 gives the top-level option we are parsing in the -moverride string,
13774 for use in error messages. */
13775
13776 static unsigned int
13777 aarch64_parse_boolean_options (const char *option,
13778 const struct aarch64_flag_desc *flags,
13779 unsigned int initial_state,
13780 const char *option_name)
13781 {
13782 const char separator = '.';
13783 const char* specs = option;
13784 const char* ntoken = option;
13785 unsigned int found_flags = initial_state;
13786
13787 while ((ntoken = strchr (specs, separator)))
13788 {
13789 size_t token_length = ntoken - specs;
13790 unsigned token_ops = aarch64_parse_one_option_token (specs,
13791 token_length,
13792 flags,
13793 option_name);
13794 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13795 in the token stream, reset the supported operations. So:
13796
13797 adrp+add.cmp+branch.none.adrp+add
13798
13799 would have the result of turning on only adrp+add fusion. */
13800 if (!token_ops)
13801 found_flags = 0;
13802
13803 found_flags |= token_ops;
13804 specs = ++ntoken;
13805 }
13806
13807 /* The option string ended with a trailing separator; diagnose it. */
13808 if (!(*specs))
13809 {
13810 error ("%s string ill-formed\n", option_name);
13811 return 0;
13812 }
13813
13814 /* We still have one more token to parse. */
13815 size_t token_length = strlen (specs);
13816 unsigned token_ops = aarch64_parse_one_option_token (specs,
13817 token_length,
13818 flags,
13819 option_name);
13820 if (!token_ops)
13821 found_flags = 0;
13822
13823 found_flags |= token_ops;
13824 return found_flags;
13825 }
13826
13827 /* Support for overriding instruction fusion. */
13828
13829 static void
13830 aarch64_parse_fuse_string (const char *fuse_string,
13831 struct tune_params *tune)
13832 {
13833 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13834 aarch64_fusible_pairs,
13835 tune->fusible_ops,
13836 "fuse=");
13837 }
13838
13839 /* Support for overriding other tuning flags. */
13840
13841 static void
13842 aarch64_parse_tune_string (const char *tune_string,
13843 struct tune_params *tune)
13844 {
13845 tune->extra_tuning_flags
13846 = aarch64_parse_boolean_options (tune_string,
13847 aarch64_tuning_flags,
13848 tune->extra_tuning_flags,
13849 "tune=");
13850 }
13851
13852 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
13853 Accept the valid SVE vector widths allowed by
13854 aarch64_sve_vector_bits_enum and use it to override sve_width
13855 in TUNE. */
13856
13857 static void
13858 aarch64_parse_sve_width_string (const char *tune_string,
13859 struct tune_params *tune)
13860 {
13861 int width = -1;
13862
13863 int n = sscanf (tune_string, "%d", &width);
13864 if (n == EOF)
13865 {
13866 error ("invalid format for sve_width");
13867 return;
13868 }
13869 switch (width)
13870 {
13871 case SVE_128:
13872 case SVE_256:
13873 case SVE_512:
13874 case SVE_1024:
13875 case SVE_2048:
13876 break;
13877 default:
13878 error ("invalid sve_width value: %d", width);
13879 }
13880 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13881 }
13882
13883 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13884 we understand. If it is, extract the option string and hand it off to
13885 the appropriate function. */
13886
13887 void
13888 aarch64_parse_one_override_token (const char* token,
13889 size_t length,
13890 struct tune_params *tune)
13891 {
13892 const struct aarch64_tuning_override_function *fn
13893 = aarch64_tuning_override_functions;
13894
13895 const char *option_part = strchr (token, '=');
13896 if (!option_part)
13897 {
13898 error ("tuning string missing in option (%s)", token);
13899 return;
13900 }
13901
13902 /* Get the length of the option name. */
13903 length = option_part - token;
13904 /* Skip the '=' to get to the option string. */
13905 option_part++;
13906
13907 for (; fn->name != NULL; fn++)
13908 {
13909 if (!strncmp (fn->name, token, length))
13910 {
13911 fn->parse_override (option_part, tune);
13912 return;
13913 }
13914 }
13915
13916 error ("unknown tuning option (%s)",token);
13917 return;
13918 }
13919
13920 /* Validate and clamp the TLS size according to the code model in OPTS. */
13921
13922 static void
13923 initialize_aarch64_tls_size (struct gcc_options *opts)
13924 {
13925 if (aarch64_tls_size == 0)
13926 aarch64_tls_size = 24;
13927
13928 switch (opts->x_aarch64_cmodel_var)
13929 {
13930 case AARCH64_CMODEL_TINY:
13931 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13932 needs two instructions to address, so we clamp the size to 24. */
13933 if (aarch64_tls_size > 24)
13934 aarch64_tls_size = 24;
13935 break;
13936 case AARCH64_CMODEL_SMALL:
13937 /* The maximum TLS size allowed under small is 4G. */
13938 if (aarch64_tls_size > 32)
13939 aarch64_tls_size = 32;
13940 break;
13941 case AARCH64_CMODEL_LARGE:
13942 /* The maximum TLS size allowed under large is 16E.
13943 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
13944 if (aarch64_tls_size > 48)
13945 aarch64_tls_size = 48;
13946 break;
13947 default:
13948 gcc_unreachable ();
13949 }
13950
13951 return;
13952 }
13953
13954 /* Parse STRING looking for options in the format:
13955 string :: option:string
13956 option :: name=substring
13957 name :: {a-z}
13958 substring :: defined by option. */
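/* For example, "sve_width=256:fuse=adrp+add" (illustrative values) is split
   at ':' into two name=substring options.  */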
13959
13960 static void
13961 aarch64_parse_override_string (const char* input_string,
13962 struct tune_params* tune)
13963 {
13964 const char separator = ':';
13965 size_t string_length = strlen (input_string) + 1;
13966 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13967 char *string = string_root;
13968 strncpy (string, input_string, string_length);
13969 string[string_length - 1] = '\0';
13970
13971 char* ntoken = string;
13972
13973 while ((ntoken = strchr (string, separator)))
13974 {
13975 size_t token_length = ntoken - string;
13976 /* Make this substring look like a string. */
13977 *ntoken = '\0';
13978 aarch64_parse_one_override_token (string, token_length, tune);
13979 string = ++ntoken;
13980 }
13981
13982 /* One last option to parse. */
13983 aarch64_parse_one_override_token (string, strlen (string), tune);
13984 free (string_root);
13985 }
13986
13987
13988 static void
13989 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13990 {
13991 if (accepted_branch_protection_string)
13992 {
13993 opts->x_aarch64_branch_protection_string
13994 = xstrdup (accepted_branch_protection_string);
13995 }
13996
13997 /* PR 70044: We have to be careful about being called multiple times for the
13998 same function. This means all changes should be repeatable. */
13999
14000 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14001 Disable the frame pointer flag so the mid-end will not use a frame
14002 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14003 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14004 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14005 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14006 if (opts->x_flag_omit_frame_pointer == 0)
14007 opts->x_flag_omit_frame_pointer = 2;
14008
14009 /* If not optimizing for size, set the default
14010 alignment to what the target wants. */
14011 if (!opts->x_optimize_size)
14012 {
14013 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14014 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14015 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14016 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14017 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14018 opts->x_str_align_functions = aarch64_tune_params.function_align;
14019 }
14020
14021 /* We default to no pc-relative literal loads. */
14022
14023 aarch64_pcrelative_literal_loads = false;
14024
14025 /* If -mpc-relative-literal-loads is set on the command line, this
14026 implies that the user asked for PC relative literal loads. */
14027 if (opts->x_pcrelative_literal_loads == 1)
14028 aarch64_pcrelative_literal_loads = true;
14029
14030 /* In the tiny memory model it makes no sense to disallow PC relative
14031 literal pool loads. */
14032 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14033 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14034 aarch64_pcrelative_literal_loads = true;
14035
14036 /* When enabling the lower precision Newton series for the square root, also
14037 enable it for the reciprocal square root, since the latter is an
14038 intermediary step for the former. */
14039 if (flag_mlow_precision_sqrt)
14040 flag_mrecip_low_precision_sqrt = true;
14041 }
14042
14043 /* 'Unpack' the internal tuning structs and update the options
14044 in OPTS. The caller must have set up selected_tune and selected_arch
14045 as all the other target-specific codegen decisions are
14046 derived from them. */
14047
14048 void
14049 aarch64_override_options_internal (struct gcc_options *opts)
14050 {
14051 aarch64_tune_flags = selected_tune->flags;
14052 aarch64_tune = selected_tune->sched_core;
14053 /* Make a copy of the tuning parameters attached to the core, which
14054 we may later overwrite. */
14055 aarch64_tune_params = *(selected_tune->tune);
14056 aarch64_architecture_version = selected_arch->architecture_version;
14057
14058 if (opts->x_aarch64_override_tune_string)
14059 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14060 &aarch64_tune_params);
14061
14062 /* This target defaults to strict volatile bitfields. */
14063 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14064 opts->x_flag_strict_volatile_bitfields = 1;
14065
14066 if (aarch64_stack_protector_guard == SSP_GLOBAL
14067 && opts->x_aarch64_stack_protector_guard_offset_str)
14068 {
14069 error ("incompatible options %<-mstack-protector-guard=global%> and "
14070 "%<-mstack-protector-guard-offset=%s%>",
14071 aarch64_stack_protector_guard_offset_str);
14072 }
14073
14074 if (aarch64_stack_protector_guard == SSP_SYSREG
14075 && !(opts->x_aarch64_stack_protector_guard_offset_str
14076 && opts->x_aarch64_stack_protector_guard_reg_str))
14077 {
14078 error ("both %<-mstack-protector-guard-offset%> and "
14079 "%<-mstack-protector-guard-reg%> must be used "
14080 "with %<-mstack-protector-guard=sysreg%>");
14081 }
14082
14083 if (opts->x_aarch64_stack_protector_guard_reg_str)
14084 {
14085 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14086 error ("specify a system register with a small string length");
14087 }
14088
14089 if (opts->x_aarch64_stack_protector_guard_offset_str)
14090 {
14091 char *end;
14092 const char *str = aarch64_stack_protector_guard_offset_str;
14093 errno = 0;
14094 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14095 if (!*str || *end || errno)
14096 error ("%qs is not a valid offset in %qs", str,
14097 "-mstack-protector-guard-offset=");
14098 aarch64_stack_protector_guard_offset = offs;
14099 }
14100
14101 initialize_aarch64_code_model (opts);
14102 initialize_aarch64_tls_size (opts);
14103
14104 int queue_depth = 0;
14105 switch (aarch64_tune_params.autoprefetcher_model)
14106 {
14107 case tune_params::AUTOPREFETCHER_OFF:
14108 queue_depth = -1;
14109 break;
14110 case tune_params::AUTOPREFETCHER_WEAK:
14111 queue_depth = 0;
14112 break;
14113 case tune_params::AUTOPREFETCHER_STRONG:
14114 queue_depth = max_insn_queue_index + 1;
14115 break;
14116 default:
14117 gcc_unreachable ();
14118 }
14119
14120 /* We don't mind passing in global_options_set here as we don't use
14121 the *options_set structs anyway. */
14122 SET_OPTION_IF_UNSET (opts, &global_options_set,
14123 param_sched_autopref_queue_depth, queue_depth);
14124
14125 /* Set up parameters to be used in prefetching algorithm. Do not
14126 override the defaults unless we are tuning for a core we have
14127 researched values for. */
14128 if (aarch64_tune_params.prefetch->num_slots > 0)
14129 SET_OPTION_IF_UNSET (opts, &global_options_set,
14130 param_simultaneous_prefetches,
14131 aarch64_tune_params.prefetch->num_slots);
14132 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14133 SET_OPTION_IF_UNSET (opts, &global_options_set,
14134 param_l1_cache_size,
14135 aarch64_tune_params.prefetch->l1_cache_size);
14136 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14137 SET_OPTION_IF_UNSET (opts, &global_options_set,
14138 param_l1_cache_line_size,
14139 aarch64_tune_params.prefetch->l1_cache_line_size);
14140 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14141 SET_OPTION_IF_UNSET (opts, &global_options_set,
14142 param_l2_cache_size,
14143 aarch64_tune_params.prefetch->l2_cache_size);
14144 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14145 SET_OPTION_IF_UNSET (opts, &global_options_set,
14146 param_prefetch_dynamic_strides, 0);
14147 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14148 SET_OPTION_IF_UNSET (opts, &global_options_set,
14149 param_prefetch_minimum_stride,
14150 aarch64_tune_params.prefetch->minimum_stride);
14151
14152 /* Use the alternative scheduling-pressure algorithm by default. */
14153 SET_OPTION_IF_UNSET (opts, &global_options_set,
14154 param_sched_pressure_algorithm,
14155 SCHED_PRESSURE_MODEL);
14156
14157 /* Validate the guard size. */
14158 int guard_size = param_stack_clash_protection_guard_size;
14159
14160 if (guard_size != 12 && guard_size != 16)
14161 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14162 "size. Given value %d (%llu KB) is out of range",
14163 guard_size, (1ULL << guard_size) / 1024ULL);
14164
14165 /* Enforce that the probing interval is the same as the guard size so the
14166 mid-end does the right thing. */
14167 SET_OPTION_IF_UNSET (opts, &global_options_set,
14168 param_stack_clash_protection_probe_interval,
14169 guard_size);
14170
14171 /* The SET_OPTION_IF_UNSET calls won't update the value if the user has
14172 explicitly set one, which means we need to validate that the probing
14173 interval and the guard size are equal. */
14174 int probe_interval
14175 = param_stack_clash_protection_probe_interval;
14176 if (guard_size != probe_interval)
14177 error ("stack clash guard size %<%d%> must be equal to probing interval "
14178 "%<%d%>", guard_size, probe_interval);
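/* Worked example: --param stack-clash-protection-guard-size=16 selects a
   64 KB guard (1 << 16 bytes) and, unless the user set it explicitly,
   also forces --param stack-clash-protection-probe-interval to 16; an
   explicit probe interval of 12 combined with a guard size of 16 is
   rejected by the check above.  */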
14179
14180 /* Enable software prefetching at the specified optimization level for
14181 CPUs that have prefetch tuning data. Lower the optimization level
14182 threshold by 1 when profiling is enabled. */
14183 if (opts->x_flag_prefetch_loop_arrays < 0
14184 && !opts->x_optimize_size
14185 && aarch64_tune_params.prefetch->default_opt_level >= 0
14186 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14187 opts->x_flag_prefetch_loop_arrays = 1;
14188
14189 if (opts->x_aarch64_arch_string == NULL)
14190 opts->x_aarch64_arch_string = selected_arch->name;
14191 if (opts->x_aarch64_cpu_string == NULL)
14192 opts->x_aarch64_cpu_string = selected_cpu->name;
14193 if (opts->x_aarch64_tune_string == NULL)
14194 opts->x_aarch64_tune_string = selected_tune->name;
14195
14196 aarch64_override_options_after_change_1 (opts);
14197 }
14198
14199 /* Print a hint with a suggestion for a core or architecture name that
14200 most closely resembles what the user passed in STR. ARCH is true if
14201 the user is asking for an architecture name. ARCH is false if the user
14202 is asking for a core name. */
14203
14204 static void
14205 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14206 {
14207 auto_vec<const char *> candidates;
14208 const struct processor *entry = arch ? all_architectures : all_cores;
14209 for (; entry->name != NULL; entry++)
14210 candidates.safe_push (entry->name);
14211
14212 #ifdef HAVE_LOCAL_CPU_DETECT
14213 /* Also add "native" as a possible value. */
14214 if (arch)
14215 candidates.safe_push ("native");
14216 #endif
14217
14218 char *s;
14219 const char *hint = candidates_list_and_hint (str, s, candidates);
14220 if (hint)
14221 inform (input_location, "valid arguments are: %s;"
14222 " did you mean %qs?", s, hint);
14223 else
14224 inform (input_location, "valid arguments are: %s", s);
14225
14226 XDELETEVEC (s);
14227 }
14228
14229 /* Print a hint with a suggestion for a core name that most closely resembles
14230 what the user passed in STR. */
14231
14232 inline static void
14233 aarch64_print_hint_for_core (const char *str)
14234 {
14235 aarch64_print_hint_for_core_or_arch (str, false);
14236 }
14237
14238 /* Print a hint with a suggestion for an architecture name that most closely
14239 resembles what the user passed in STR. */
14240
14241 inline static void
14242 aarch64_print_hint_for_arch (const char *str)
14243 {
14244 aarch64_print_hint_for_core_or_arch (str, true);
14245 }
14246
14247
14248 /* Print a hint with a suggestion for an extension name
14249 that most closely resembles what the user passed in STR. */
14250
14251 void
14252 aarch64_print_hint_for_extensions (const std::string &str)
14253 {
14254 auto_vec<const char *> candidates;
14255 aarch64_get_all_extension_candidates (&candidates);
14256 char *s;
14257 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14258 if (hint)
14259 inform (input_location, "valid arguments are: %s;"
14260 " did you mean %qs?", s, hint);
14261 else
14262 inform (input_location, "valid arguments are: %s", s);
14263
14264 XDELETEVEC (s);
14265 }
14266
14267 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14268 specified in STR and throw errors if appropriate. Put the results,
14269 if they are valid, in RES and ISA_FLAGS. Return whether the option is
14270 valid. */
14271
14272 static bool
14273 aarch64_validate_mcpu (const char *str, const struct processor **res,
14274 uint64_t *isa_flags)
14275 {
14276 std::string invalid_extension;
14277 enum aarch64_parse_opt_result parse_res
14278 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14279
14280 if (parse_res == AARCH64_PARSE_OK)
14281 return true;
14282
14283 switch (parse_res)
14284 {
14285 case AARCH64_PARSE_MISSING_ARG:
14286 error ("missing cpu name in %<-mcpu=%s%>", str);
14287 break;
14288 case AARCH64_PARSE_INVALID_ARG:
14289 error ("unknown value %qs for %<-mcpu%>", str);
14290 aarch64_print_hint_for_core (str);
14291 break;
14292 case AARCH64_PARSE_INVALID_FEATURE:
14293 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14294 invalid_extension.c_str (), str);
14295 aarch64_print_hint_for_extensions (invalid_extension);
14296 break;
14297 default:
14298 gcc_unreachable ();
14299 }
14300
14301 return false;
14302 }
14303
14304 /* Parses CONST_STR for branch protection features specified in
14305 aarch64_branch_protect_types, and sets any global variables required.
14306 Returns the parsing result and copies the last processed token from
14307 CONST_STR into LAST_STR so that it can be used for error reporting. */
14308
14309 static enum aarch64_parse_opt_result
14310 aarch64_parse_branch_protection (const char *const_str,
14311 char **last_str)
14312 {
14313 char *str_root = xstrdup (const_str);
14314 char* token_save = NULL;
14315 char *str = strtok_r (str_root, "+", &token_save);
14316 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14317 if (!str)
14318 res = AARCH64_PARSE_MISSING_ARG;
14319 else
14320 {
14321 char *next_str = strtok_r (NULL, "+", &token_save);
14322 /* Reset the branch protection features to their defaults. */
14323 aarch64_handle_no_branch_protection (NULL, NULL);
14324
14325 while (str && res == AARCH64_PARSE_OK)
14326 {
14327 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14328 bool found = false;
14329 /* Search for this type. */
14330 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14331 {
14332 if (strcmp (str, type->name) == 0)
14333 {
14334 found = true;
14335 res = type->handler (str, next_str);
14336 str = next_str;
14337 next_str = strtok_r (NULL, "+", &token_save);
14338 }
14339 else
14340 type++;
14341 }
14342 if (found && res == AARCH64_PARSE_OK)
14343 {
14344 bool found_subtype = true;
14345 /* Loop through each token until we find one that isn't a
14346 subtype. */
14347 while (found_subtype)
14348 {
14349 found_subtype = false;
14350 const aarch64_branch_protect_type *subtype = type->subtypes;
14351 /* Search for the subtype. */
14352 while (str && subtype && subtype->name && !found_subtype
14353 && res == AARCH64_PARSE_OK)
14354 {
14355 if (strcmp (str, subtype->name) == 0)
14356 {
14357 found_subtype = true;
14358 res = subtype->handler (str, next_str);
14359 str = next_str;
14360 next_str = strtok_r (NULL, "+", &token_save);
14361 }
14362 else
14363 subtype++;
14364 }
14365 }
14366 }
14367 else if (!found)
14368 res = AARCH64_PARSE_INVALID_ARG;
14369 }
14370 }
14371 /* Copy the last processed token into the argument to pass it back.
14372 Used by option and attribute validation to print the offending token. */
14373 if (last_str)
14374 {
14375 if (str) strcpy (*last_str, str);
14376 else *last_str = NULL;
14377 }
14378 if (res == AARCH64_PARSE_OK)
14379 {
14380 /* If needed, alloc the accepted string then copy in const_str.
14381 Used by aarch64_override_options_after_change_1. */
14382 if (!accepted_branch_protection_string)
14383 accepted_branch_protection_string = (char *) xmalloc (
14384 BRANCH_PROTECT_STR_MAX
14385 + 1);
14386 strncpy (accepted_branch_protection_string, const_str,
14387 BRANCH_PROTECT_STR_MAX + 1);
14388 /* Forcibly null-terminate. */
14389 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14390 }
14391 return res;
14392 }
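/* For example, -mbranch-protection=pac-ret+leaf+bti is split at '+' into
   "pac-ret", "leaf" and "bti": "pac-ret" matches a top-level type, "leaf"
   is then accepted as one of its subtypes, and "bti" matches another
   top-level type.  A token that matches neither, e.g. the "foo" in
   "pac-ret+foo", leaves the loops with AARCH64_PARSE_INVALID_ARG.  */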
14393
14394 static bool
14395 aarch64_validate_mbranch_protection (const char *const_str)
14396 {
14397 char *str = (char *) xmalloc (strlen (const_str) + 1);
14398 enum aarch64_parse_opt_result res =
14399 aarch64_parse_branch_protection (const_str, &str);
14400 if (res == AARCH64_PARSE_INVALID_ARG)
14401 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14402 else if (res == AARCH64_PARSE_MISSING_ARG)
14403 error ("missing argument for %<-mbranch-protection=%>");
14404 free (str);
14405 return res == AARCH64_PARSE_OK;
14406 }
14407
14408 /* Validate a command-line -march option. Parse the arch and extensions
14409 (if any) specified in STR and throw errors if appropriate. Put the
14410 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14411 option is valid. */
14412
14413 static bool
14414 aarch64_validate_march (const char *str, const struct processor **res,
14415 uint64_t *isa_flags)
14416 {
14417 std::string invalid_extension;
14418 enum aarch64_parse_opt_result parse_res
14419 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14420
14421 if (parse_res == AARCH64_PARSE_OK)
14422 return true;
14423
14424 switch (parse_res)
14425 {
14426 case AARCH64_PARSE_MISSING_ARG:
14427 error ("missing arch name in %<-march=%s%>", str);
14428 break;
14429 case AARCH64_PARSE_INVALID_ARG:
14430 error ("unknown value %qs for %<-march%>", str);
14431 aarch64_print_hint_for_arch (str);
14432 break;
14433 case AARCH64_PARSE_INVALID_FEATURE:
14434 error ("invalid feature modifier %qs in %<-march=%s%>",
14435 invalid_extension.c_str (), str);
14436 aarch64_print_hint_for_extensions (invalid_extension);
14437 break;
14438 default:
14439 gcc_unreachable ();
14440 }
14441
14442 return false;
14443 }
14444
14445 /* Validate a command-line -mtune option. Parse the cpu
14446 specified in STR and throw errors if appropriate. Put the
14447 result, if it is valid, in RES. Return whether the option is
14448 valid. */
14449
14450 static bool
14451 aarch64_validate_mtune (const char *str, const struct processor **res)
14452 {
14453 enum aarch64_parse_opt_result parse_res
14454 = aarch64_parse_tune (str, res);
14455
14456 if (parse_res == AARCH64_PARSE_OK)
14457 return true;
14458
14459 switch (parse_res)
14460 {
14461 case AARCH64_PARSE_MISSING_ARG:
14462 error ("missing cpu name in %<-mtune=%s%>", str);
14463 break;
14464 case AARCH64_PARSE_INVALID_ARG:
14465 error ("unknown value %qs for %<-mtune%>", str);
14466 aarch64_print_hint_for_core (str);
14467 break;
14468 default:
14469 gcc_unreachable ();
14470 }
14471 return false;
14472 }
14473
14474 /* Return the CPU corresponding to the enum CPU.
14475 If it doesn't specify a cpu, return the default. */
14476
14477 static const struct processor *
14478 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14479 {
14480 if (cpu != aarch64_none)
14481 return &all_cores[cpu];
14482
14483 /* The & 0x3f is to extract the bottom 6 bits that encode the
14484 default cpu as selected by the --with-cpu GCC configure option
14485 in config.gcc.
14486 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14487 flags mechanism should be reworked to make it more sane. */
14488 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14489 }
14490
14491 /* Return the architecture corresponding to the enum ARCH.
14492 If it doesn't specify a valid architecture, return the default. */
14493
14494 static const struct processor *
14495 aarch64_get_arch (enum aarch64_arch arch)
14496 {
14497 if (arch != aarch64_no_arch)
14498 return &all_architectures[arch];
14499
14500 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14501
14502 return &all_architectures[cpu->arch];
14503 }
14504
14505 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
14506
14507 static poly_uint16
14508 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14509 {
14510 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14511 on big-endian targets, so we would need to forbid subregs that convert
14512 from one to the other. By default a reinterpret sequence would then
14513 involve a store to memory in one mode and a load back in the other.
14514 Even if we optimize that sequence using reverse instructions,
14515 it would still be a significant potential overhead.
14516
14517 For now, it seems better to generate length-agnostic code for that
14518 case instead. */
14519 if (value == SVE_SCALABLE
14520 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14521 return poly_uint16 (2, 2);
14522 else
14523 return (int) value / 64;
14524 }
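/* For example: -msve-vector-bits=256 yields 256 / 64 = 4, i.e. VG = 4
   (four 64-bit granules per SVE vector) and -msve-vector-bits=512 yields
   VG = 8, both compile-time constants.  -msve-vector-bits=scalable (and
   the big-endian 128-bit case) instead yields the length-agnostic
   poly_uint16 (2, 2), i.e. 2 + 2 * x granules for a runtime x >= 0.  */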
14525
14526 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
14527 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
14528 tuning structs. In particular it must set selected_tune and
14529 aarch64_isa_flags that define the available ISA features and tuning
14530 decisions. It must also set selected_arch as this will be used to
14531 output the .arch asm tags for each function. */
14532
14533 static void
14534 aarch64_override_options (void)
14535 {
14536 uint64_t cpu_isa = 0;
14537 uint64_t arch_isa = 0;
14538 aarch64_isa_flags = 0;
14539
14540 bool valid_cpu = true;
14541 bool valid_tune = true;
14542 bool valid_arch = true;
14543
14544 selected_cpu = NULL;
14545 selected_arch = NULL;
14546 selected_tune = NULL;
14547
14548 if (aarch64_branch_protection_string)
14549 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14550
14551 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14552 If either of -march or -mtune is given, they override their
14553 respective component of -mcpu. */
14554 if (aarch64_cpu_string)
14555 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14556 &cpu_isa);
14557
14558 if (aarch64_arch_string)
14559 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14560 &arch_isa);
14561
14562 if (aarch64_tune_string)
14563 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14564
14565 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14566 SUBTARGET_OVERRIDE_OPTIONS;
14567 #endif
14568
14569 /* If the user did not specify a processor, choose the default
14570 one for them. This will be the CPU set during configuration using
14571 --with-cpu, otherwise it is "generic". */
14572 if (!selected_cpu)
14573 {
14574 if (selected_arch)
14575 {
14576 selected_cpu = &all_cores[selected_arch->ident];
14577 aarch64_isa_flags = arch_isa;
14578 explicit_arch = selected_arch->arch;
14579 }
14580 else
14581 {
14582 /* Get default configure-time CPU. */
14583 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14584 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14585 }
14586
14587 if (selected_tune)
14588 explicit_tune_core = selected_tune->ident;
14589 }
14590 /* If both -mcpu and -march are specified check that they are architecturally
14591 compatible, warn if they're not and prefer the -march ISA flags. */
14592 else if (selected_arch)
14593 {
14594 if (selected_arch->arch != selected_cpu->arch)
14595 {
14596 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14597 aarch64_cpu_string,
14598 aarch64_arch_string);
14599 }
14600 aarch64_isa_flags = arch_isa;
14601 explicit_arch = selected_arch->arch;
14602 explicit_tune_core = selected_tune ? selected_tune->ident
14603 : selected_cpu->ident;
14604 }
14605 else
14606 {
14607 /* -mcpu but no -march. */
14608 aarch64_isa_flags = cpu_isa;
14609 explicit_tune_core = selected_tune ? selected_tune->ident
14610 : selected_cpu->ident;
14611 gcc_assert (selected_cpu);
14612 selected_arch = &all_architectures[selected_cpu->arch];
14613 explicit_arch = selected_arch->arch;
14614 }
14615
14616 /* Set the arch as well, as we will need it when outputting
14617 the .arch directive in assembly. */
14618 if (!selected_arch)
14619 {
14620 gcc_assert (selected_cpu);
14621 selected_arch = &all_architectures[selected_cpu->arch];
14622 }
14623
14624 if (!selected_tune)
14625 selected_tune = selected_cpu;
14626
14627 if (aarch64_enable_bti == 2)
14628 {
14629 #ifdef TARGET_ENABLE_BTI
14630 aarch64_enable_bti = 1;
14631 #else
14632 aarch64_enable_bti = 0;
14633 #endif
14634 }
14635
14636 /* Return address signing is currently not supported for ILP32 targets. For
14637 LP64 targets use the configured option in the absence of a command-line
14638 option for -mbranch-protection. */
14639 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14640 {
14641 #ifdef TARGET_ENABLE_PAC_RET
14642 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14643 #else
14644 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14645 #endif
14646 }
14647
14648 #ifndef HAVE_AS_MABI_OPTION
14649 /* The compiler may have been configured with 2.23.* binutils, which does
14650 not have support for ILP32. */
14651 if (TARGET_ILP32)
14652 error ("assembler does not support %<-mabi=ilp32%>");
14653 #endif
14654
14655 /* Convert -msve-vector-bits to a VG count. */
14656 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14657
14658 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14659 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14660
14661 /* Make sure we properly set up the explicit options. */
14662 if ((aarch64_cpu_string && valid_cpu)
14663 || (aarch64_tune_string && valid_tune))
14664 gcc_assert (explicit_tune_core != aarch64_none);
14665
14666 if ((aarch64_cpu_string && valid_cpu)
14667 || (aarch64_arch_string && valid_arch))
14668 gcc_assert (explicit_arch != aarch64_no_arch);
14669
14670 /* The pass to insert speculation tracking runs before
14671 shrink-wrapping and the latter does not know how to update the
14672 tracking status. So disable it in this case. */
14673 if (aarch64_track_speculation)
14674 flag_shrink_wrap = 0;
14675
14676 aarch64_override_options_internal (&global_options);
14677
14678 /* Save these options as the default ones in case we push and pop them later
14679 while processing functions with potential target attributes. */
14680 target_option_default_node = target_option_current_node
14681 = build_target_option_node (&global_options);
14682 }
14683
14684 /* Implement targetm.override_options_after_change. */
14685
14686 static void
14687 aarch64_override_options_after_change (void)
14688 {
14689 aarch64_override_options_after_change_1 (&global_options);
14690 }
14691
14692 static struct machine_function *
14693 aarch64_init_machine_status (void)
14694 {
14695 struct machine_function *machine;
14696 machine = ggc_cleared_alloc<machine_function> ();
14697 return machine;
14698 }
14699
14700 void
14701 aarch64_init_expanders (void)
14702 {
14703 init_machine_status = aarch64_init_machine_status;
14704 }
14705
14706 /* A checking mechanism for the implementation of the various code models. */
14707 static void
14708 initialize_aarch64_code_model (struct gcc_options *opts)
14709 {
14710 if (opts->x_flag_pic)
14711 {
14712 switch (opts->x_aarch64_cmodel_var)
14713 {
14714 case AARCH64_CMODEL_TINY:
14715 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14716 break;
14717 case AARCH64_CMODEL_SMALL:
14718 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14719 aarch64_cmodel = (flag_pic == 2
14720 ? AARCH64_CMODEL_SMALL_PIC
14721 : AARCH64_CMODEL_SMALL_SPIC);
14722 #else
14723 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14724 #endif
14725 break;
14726 case AARCH64_CMODEL_LARGE:
14727 sorry ("code model %qs with %<-f%s%>", "large",
14728 opts->x_flag_pic > 1 ? "PIC" : "pic");
14729 break;
14730 default:
14731 gcc_unreachable ();
14732 }
14733 }
14734 else
14735 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14736 }
14737
14738 /* Implement TARGET_OPTION_SAVE. */
14739
14740 static void
14741 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14742 {
14743 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14744 ptr->x_aarch64_branch_protection_string
14745 = opts->x_aarch64_branch_protection_string;
14746 }
14747
14748 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14749 using the information saved in PTR. */
14750
14751 static void
14752 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14753 {
14754 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14755 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14756 opts->x_explicit_arch = ptr->x_explicit_arch;
14757 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14758 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14759 opts->x_aarch64_branch_protection_string
14760 = ptr->x_aarch64_branch_protection_string;
14761 if (opts->x_aarch64_branch_protection_string)
14762 {
14763 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14764 NULL);
14765 }
14766
14767 aarch64_override_options_internal (opts);
14768 }
14769
14770 /* Implement TARGET_OPTION_PRINT. */
14771
14772 static void
14773 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14774 {
14775 const struct processor *cpu
14776 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14777 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14778 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14779 std::string extension
14780 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14781
14782 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14783 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14784 arch->name, extension.c_str ());
14785 }
14786
14787 static GTY(()) tree aarch64_previous_fndecl;
14788
14789 void
14790 aarch64_reset_previous_fndecl (void)
14791 {
14792 aarch64_previous_fndecl = NULL;
14793 }
14794
14795 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14796 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14797 make sure optab availability predicates are recomputed when necessary. */
14798
14799 void
14800 aarch64_save_restore_target_globals (tree new_tree)
14801 {
14802 if (TREE_TARGET_GLOBALS (new_tree))
14803 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14804 else if (new_tree == target_option_default_node)
14805 restore_target_globals (&default_target_globals);
14806 else
14807 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14808 }
14809
14810 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14811 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14812 of the function, if such exists. This function may be called multiple
14813 times on a single function so use aarch64_previous_fndecl to avoid
14814 setting up identical state. */
14815
14816 static void
14817 aarch64_set_current_function (tree fndecl)
14818 {
14819 if (!fndecl || fndecl == aarch64_previous_fndecl)
14820 return;
14821
14822 tree old_tree = (aarch64_previous_fndecl
14823 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14824 : NULL_TREE);
14825
14826 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14827
14828 /* If current function has no attributes but the previous one did,
14829 use the default node. */
14830 if (!new_tree && old_tree)
14831 new_tree = target_option_default_node;
14832
14833 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14834 the default have been handled by aarch64_save_restore_target_globals from
14835 aarch64_pragma_target_parse. */
14836 if (old_tree == new_tree)
14837 return;
14838
14839 aarch64_previous_fndecl = fndecl;
14840
14841 /* First set the target options. */
14842 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14843
14844 aarch64_save_restore_target_globals (new_tree);
14845 }
14846
14847 /* Enum describing the various ways we can handle attributes.
14848 In many cases we can reuse the generic option handling machinery. */
14849
14850 enum aarch64_attr_opt_type
14851 {
14852 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14853 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14854 aarch64_attr_enum, /* Attribute sets an enum variable. */
14855 aarch64_attr_custom /* Attribute requires a custom handling function. */
14856 };
14857
14858 /* All the information needed to handle a target attribute.
14859 NAME is the name of the attribute.
14860 ATTR_TYPE specifies the type of behavior of the attribute as described
14861 in the definition of enum aarch64_attr_opt_type.
14862 ALLOW_NEG is true if the attribute supports a "no-" form.
14863 HANDLER is the function that takes the attribute string as an argument
14864 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14865 OPT_NUM is the enum specifying the option that the attribute modifies.
14866 This is needed for attributes that mirror the behavior of a command-line
14867 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14868 aarch64_attr_enum. */
14869
14870 struct aarch64_attribute_info
14871 {
14872 const char *name;
14873 enum aarch64_attr_opt_type attr_type;
14874 bool allow_neg;
14875 bool (*handler) (const char *);
14876 enum opt_code opt_num;
14877 };
14878
14879 /* Handle the ARCH_STR argument to the arch= target attribute. */
14880
14881 static bool
14882 aarch64_handle_attr_arch (const char *str)
14883 {
14884 const struct processor *tmp_arch = NULL;
14885 std::string invalid_extension;
14886 enum aarch64_parse_opt_result parse_res
14887 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14888
14889 if (parse_res == AARCH64_PARSE_OK)
14890 {
14891 gcc_assert (tmp_arch);
14892 selected_arch = tmp_arch;
14893 explicit_arch = selected_arch->arch;
14894 return true;
14895 }
14896
14897 switch (parse_res)
14898 {
14899 case AARCH64_PARSE_MISSING_ARG:
14900 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14901 break;
14902 case AARCH64_PARSE_INVALID_ARG:
14903 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14904 aarch64_print_hint_for_arch (str);
14905 break;
14906 case AARCH64_PARSE_INVALID_FEATURE:
14907 error ("invalid feature modifier %s of value (\"%s\") in "
14908 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14909 aarch64_print_hint_for_extensions (invalid_extension);
14910 break;
14911 default:
14912 gcc_unreachable ();
14913 }
14914
14915 return false;
14916 }
14917
14918 /* Handle the argument CPU_STR to the cpu= target attribute. */
14919
14920 static bool
14921 aarch64_handle_attr_cpu (const char *str)
14922 {
14923 const struct processor *tmp_cpu = NULL;
14924 std::string invalid_extension;
14925 enum aarch64_parse_opt_result parse_res
14926 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14927
14928 if (parse_res == AARCH64_PARSE_OK)
14929 {
14930 gcc_assert (tmp_cpu);
14931 selected_tune = tmp_cpu;
14932 explicit_tune_core = selected_tune->ident;
14933
14934 selected_arch = &all_architectures[tmp_cpu->arch];
14935 explicit_arch = selected_arch->arch;
14936 return true;
14937 }
14938
14939 switch (parse_res)
14940 {
14941 case AARCH64_PARSE_MISSING_ARG:
14942 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14943 break;
14944 case AARCH64_PARSE_INVALID_ARG:
14945 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14946 aarch64_print_hint_for_core (str);
14947 break;
14948 case AARCH64_PARSE_INVALID_FEATURE:
14949 error ("invalid feature modifier %s of value (\"%s\") in "
14950 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14951 aarch64_print_hint_for_extensions (invalid_extension);
14952 break;
14953 default:
14954 gcc_unreachable ();
14955 }
14956
14957 return false;
14958 }
14959
14960 /* Handle the argument STR to the branch-protection= attribute. */
14961
14962 static bool
14963 aarch64_handle_attr_branch_protection (const char* str)
14964 {
14965 char *err_str = (char *) xmalloc (strlen (str) + 1);
14966 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14967 &err_str);
14968 bool success = false;
14969 switch (res)
14970 {
14971 case AARCH64_PARSE_MISSING_ARG:
14972 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14973 " attribute");
14974 break;
14975 case AARCH64_PARSE_INVALID_ARG:
14976 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14977 "=\")%> pragma or attribute", err_str);
14978 break;
14979 case AARCH64_PARSE_OK:
14980 success = true;
14981 /* Fall through. */
14982 case AARCH64_PARSE_INVALID_FEATURE:
14983 break;
14984 default:
14985 gcc_unreachable ();
14986 }
14987 free (err_str);
14988 return success;
14989 }
14990
14991 /* Handle the argument STR to the tune= target attribute. */
14992
14993 static bool
14994 aarch64_handle_attr_tune (const char *str)
14995 {
14996 const struct processor *tmp_tune = NULL;
14997 enum aarch64_parse_opt_result parse_res
14998 = aarch64_parse_tune (str, &tmp_tune);
14999
15000 if (parse_res == AARCH64_PARSE_OK)
15001 {
15002 gcc_assert (tmp_tune);
15003 selected_tune = tmp_tune;
15004 explicit_tune_core = selected_tune->ident;
15005 return true;
15006 }
15007
15008 switch (parse_res)
15009 {
15010 case AARCH64_PARSE_INVALID_ARG:
15011 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15012 aarch64_print_hint_for_core (str);
15013 break;
15014 default:
15015 gcc_unreachable ();
15016 }
15017
15018 return false;
15019 }
15020
15021 /* Parse an architecture extensions target attribute string specified in STR.
15022 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15023 if successful. Update aarch64_isa_flags to reflect the ISA features
15024 modified. */
15025
15026 static bool
15027 aarch64_handle_attr_isa_flags (char *str)
15028 {
15029 enum aarch64_parse_opt_result parse_res;
15030 uint64_t isa_flags = aarch64_isa_flags;
15031
15032 /* We allow "+nothing" in the beginning to clear out all architectural
15033 features if the user wants to handpick specific features. */
15034 if (strncmp ("+nothing", str, 8) == 0)
15035 {
15036 isa_flags = 0;
15037 str += 8;
15038 }
15039
15040 std::string invalid_extension;
15041 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15042
15043 if (parse_res == AARCH64_PARSE_OK)
15044 {
15045 aarch64_isa_flags = isa_flags;
15046 return true;
15047 }
15048
15049 switch (parse_res)
15050 {
15051 case AARCH64_PARSE_MISSING_ARG:
15052 error ("missing value in %<target()%> pragma or attribute");
15053 break;
15054
15055 case AARCH64_PARSE_INVALID_FEATURE:
15056 error ("invalid feature modifier %s of value (\"%s\") in "
15057 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15058 break;
15059
15060 default:
15061 gcc_unreachable ();
15062 }
15063
15064 return false;
15065 }
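/* For example, __attribute__ ((target ("+nothing+crc"))) first zeroes the
   local copy of the ISA flags because of the leading "+nothing" and then
   turns CRC (and anything it implies) back on, whereas a plain "+crc"
   adds CRC on top of the flags already in effect.  */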
15066
15067 /* The target attributes that we support. On top of these we also support just
15068 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15069 handled explicitly in aarch64_process_one_target_attr. */
15070
15071 static const struct aarch64_attribute_info aarch64_attributes[] =
15072 {
15073 { "general-regs-only", aarch64_attr_mask, false, NULL,
15074 OPT_mgeneral_regs_only },
15075 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15076 OPT_mfix_cortex_a53_835769 },
15077 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15078 OPT_mfix_cortex_a53_843419 },
15079 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15080 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15081 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15082 OPT_momit_leaf_frame_pointer },
15083 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15084 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15085 OPT_march_ },
15086 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15087 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15088 OPT_mtune_ },
15089 { "branch-protection", aarch64_attr_custom, false,
15090 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15091 { "sign-return-address", aarch64_attr_enum, false, NULL,
15092 OPT_msign_return_address_ },
15093 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15094 };
15095
15096 /* Parse ARG_STR, which contains the definition of one target attribute.
15097 Report appropriate errors if it is invalid; return true if it is valid. */
15098
15099 static bool
15100 aarch64_process_one_target_attr (char *arg_str)
15101 {
15102 bool invert = false;
15103
15104 size_t len = strlen (arg_str);
15105
15106 if (len == 0)
15107 {
15108 error ("malformed %<target()%> pragma or attribute");
15109 return false;
15110 }
15111
15112 char *str_to_check = (char *) alloca (len + 1);
15113 strcpy (str_to_check, arg_str);
15114
15115 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15116 It is easier to detect and handle it explicitly here rather than going
15117 through the machinery for the rest of the target attributes in this
15118 function. */
15119 if (*str_to_check == '+')
15120 return aarch64_handle_attr_isa_flags (str_to_check);
15121
15122 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15123 {
15124 invert = true;
15125 str_to_check += 3;
15126 }
15127 char *arg = strchr (str_to_check, '=');
15128
15129 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15130 and point ARG to "foo". */
15131 if (arg)
15132 {
15133 *arg = '\0';
15134 arg++;
15135 }
15136 const struct aarch64_attribute_info *p_attr;
15137 bool found = false;
15138 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15139 {
15140 /* If the names don't match up, or the user has given an argument
15141 to an attribute that doesn't accept one, or didn't give an argument
15142 to an attribute that expects one, fail to match. */
15143 if (strcmp (str_to_check, p_attr->name) != 0)
15144 continue;
15145
15146 found = true;
15147 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15148 || p_attr->attr_type == aarch64_attr_enum;
15149
15150 if (attr_need_arg_p ^ (arg != NULL))
15151 {
15152 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15153 return false;
15154 }
15155
15156 /* If the name matches but the attribute does not allow "no-" versions
15157 then we can't match. */
15158 if (invert && !p_attr->allow_neg)
15159 {
15160 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15161 return false;
15162 }
15163
15164 switch (p_attr->attr_type)
15165 {
15166 /* Has a custom handler registered.
15167 For example, cpu=, arch=, tune=. */
15168 case aarch64_attr_custom:
15169 gcc_assert (p_attr->handler);
15170 if (!p_attr->handler (arg))
15171 return false;
15172 break;
15173
15174 /* Either set or unset a boolean option. */
15175 case aarch64_attr_bool:
15176 {
15177 struct cl_decoded_option decoded;
15178
15179 generate_option (p_attr->opt_num, NULL, !invert,
15180 CL_TARGET, &decoded);
15181 aarch64_handle_option (&global_options, &global_options_set,
15182 &decoded, input_location);
15183 break;
15184 }
15185 /* Set or unset a bit in the target_flags. aarch64_handle_option
15186 should know what mask to apply given the option number. */
15187 case aarch64_attr_mask:
15188 {
15189 struct cl_decoded_option decoded;
15190 /* We only need to specify the option number.
15191 aarch64_handle_option will know which mask to apply. */
15192 decoded.opt_index = p_attr->opt_num;
15193 decoded.value = !invert;
15194 aarch64_handle_option (&global_options, &global_options_set,
15195 &decoded, input_location);
15196 break;
15197 }
15198 /* Use the option setting machinery to set an option to an enum. */
15199 case aarch64_attr_enum:
15200 {
15201 gcc_assert (arg);
15202 bool valid;
15203 int value;
15204 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15205 &value, CL_TARGET);
15206 if (valid)
15207 {
15208 set_option (&global_options, NULL, p_attr->opt_num, value,
15209 NULL, DK_UNSPECIFIED, input_location,
15210 global_dc);
15211 }
15212 else
15213 {
15214 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15215 }
15216 break;
15217 }
15218 default:
15219 gcc_unreachable ();
15220 }
15221 }
15222
15223 /* If we reached here we either have found an attribute and validated
15224 it or didn't match any. If we matched an attribute but its arguments
15225 were malformed we will have returned false already. */
15226 return found;
15227 }
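/* For example, aarch64_process_one_target_attr accepts (among others):
     "arch=armv8-a"      - custom handler, argument required
     "no-strict-align"   - mask attribute used in its negated form
     "cmodel=small"      - enum attribute, argument required
   while "strict-align=yes" (unexpected argument) and "no-cpu=foo"
   (negated form of an attribute that has none) are rejected with an
   error, and a leading '+', as in "+crc", is diverted to
   aarch64_handle_attr_isa_flags before the attribute table is consulted.  */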
15228
15229 /* Count how many times the character C appears in
15230 NULL-terminated string STR. */
15231
15232 static unsigned int
15233 num_occurences_in_str (char c, char *str)
15234 {
15235 unsigned int res = 0;
15236 while (*str != '\0')
15237 {
15238 if (*str == c)
15239 res++;
15240
15241 str++;
15242 }
15243
15244 return res;
15245 }
15246
15247 /* Parse the tree in ARGS that contains the target attribute information
15248 and update the global target options space. */
15249
15250 bool
15251 aarch64_process_target_attr (tree args)
15252 {
15253 if (TREE_CODE (args) == TREE_LIST)
15254 {
15255 do
15256 {
15257 tree head = TREE_VALUE (args);
15258 if (head)
15259 {
15260 if (!aarch64_process_target_attr (head))
15261 return false;
15262 }
15263 args = TREE_CHAIN (args);
15264 } while (args);
15265
15266 return true;
15267 }
15268
15269 if (TREE_CODE (args) != STRING_CST)
15270 {
15271 error ("attribute %<target%> argument not a string");
15272 return false;
15273 }
15274
15275 size_t len = strlen (TREE_STRING_POINTER (args));
15276 char *str_to_check = (char *) alloca (len + 1);
15277 strcpy (str_to_check, TREE_STRING_POINTER (args));
15278
15279 if (len == 0)
15280 {
15281 error ("malformed %<target()%> pragma or attribute");
15282 return false;
15283 }
15284
15285 /* Used to catch empty strings between consecutive commas, i.e.
15286 attribute ((target ("attr1,,attr2"))). */
15287 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15288
15289 /* Handle multiple target attributes separated by ','. */
15290 char *token = strtok_r (str_to_check, ",", &str_to_check);
15291
15292 unsigned int num_attrs = 0;
15293 while (token)
15294 {
15295 num_attrs++;
15296 if (!aarch64_process_one_target_attr (token))
15297 {
15298 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15299 return false;
15300 }
15301
15302 token = strtok_r (NULL, ",", &str_to_check);
15303 }
15304
15305 if (num_attrs != num_commas + 1)
15306 {
15307 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15308 return false;
15309 }
15310
15311 return true;
15312 }
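/* For example, __attribute__ ((target ("arch=armv8-a,no-strict-align")))
   is split into two tokens, each processed by
   aarch64_process_one_target_attr, whereas "arch=armv8-a,,no-strict-align"
   is rejected: it contains two commas but strtok_r only yields two tokens,
   so num_attrs != num_commas + 1 and the string is reported as malformed.  */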
15313
15314 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15315 process attribute ((target ("..."))). */
15316
15317 static bool
15318 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15319 {
15320 struct cl_target_option cur_target;
15321 bool ret;
15322 tree old_optimize;
15323 tree new_target, new_optimize;
15324 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15325
15326 /* If what we're processing is the current pragma string then the
15327 target option node is already stored in target_option_current_node
15328 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15329 having to re-parse the string. This is especially useful to keep
15330 arm_neon.h compile times down since that header contains a lot
15331 of intrinsics enclosed in pragmas. */
15332 if (!existing_target && args == current_target_pragma)
15333 {
15334 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15335 return true;
15336 }
15337 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15338
15339 old_optimize = build_optimization_node (&global_options);
15340 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15341
15342 /* If the function changed the optimization levels as well as setting
15343 target options, start with the optimizations specified. */
15344 if (func_optimize && func_optimize != old_optimize)
15345 cl_optimization_restore (&global_options,
15346 TREE_OPTIMIZATION (func_optimize));
15347
15348 /* Save the current target options to restore at the end. */
15349 cl_target_option_save (&cur_target, &global_options);
15350
15351 /* If fndecl already has some target attributes applied to it, unpack
15352 them so that we add this attribute on top of them, rather than
15353 overwriting them. */
15354 if (existing_target)
15355 {
15356 struct cl_target_option *existing_options
15357 = TREE_TARGET_OPTION (existing_target);
15358
15359 if (existing_options)
15360 cl_target_option_restore (&global_options, existing_options);
15361 }
15362 else
15363 cl_target_option_restore (&global_options,
15364 TREE_TARGET_OPTION (target_option_current_node));
15365
15366 ret = aarch64_process_target_attr (args);
15367
15368 /* Set up any additional state. */
15369 if (ret)
15370 {
15371 aarch64_override_options_internal (&global_options);
15372 /* Initialize SIMD builtins if we haven't already.
15373 Set current_target_pragma to NULL for the duration so that
15374 the builtin initialization code doesn't try to tag the functions
15375 being built with the attributes specified by any current pragma, thus
15376 going into an infinite recursion. */
15377 if (TARGET_SIMD)
15378 {
15379 tree saved_current_target_pragma = current_target_pragma;
15380 current_target_pragma = NULL;
15381 aarch64_init_simd_builtins ();
15382 current_target_pragma = saved_current_target_pragma;
15383 }
15384 new_target = build_target_option_node (&global_options);
15385 }
15386 else
15387 new_target = NULL;
15388
15389 new_optimize = build_optimization_node (&global_options);
15390
15391 if (fndecl && ret)
15392 {
15393 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15394
15395 if (old_optimize != new_optimize)
15396 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15397 }
15398
15399 cl_target_option_restore (&global_options, &cur_target);
15400
15401 if (old_optimize != new_optimize)
15402 cl_optimization_restore (&global_options,
15403 TREE_OPTIMIZATION (old_optimize));
15404 return ret;
15405 }
15406
15407 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15408 tri-bool options (yes, no, don't care) and the default value is
15409 DEF, determine whether to reject inlining. */
15410
15411 static bool
15412 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15413 int dont_care, int def)
15414 {
15415 /* If the callee doesn't care, always allow inlining. */
15416 if (callee == dont_care)
15417 return true;
15418
15419 /* If the caller doesn't care, always allow inlining. */
15420 if (caller == dont_care)
15421 return true;
15422
15423 /* Otherwise, allow inlining if either the callee and caller values
15424 agree, or if the callee is using the default value. */
15425 return (callee == caller || callee == def);
15426 }
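/* For example, with DONT_CARE == 2 and DEF == 1 (the
   -momit-leaf-frame-pointer check below):

     caller  callee  inline?
       2       any     yes    (caller does not care)
       any     2       yes    (callee does not care)
       0       0       yes    (explicit values agree)
       0       1       yes    (callee uses the default)
       1       0       no     (explicit, non-default mismatch)  */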
15427
15428 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15429 to inline CALLEE into CALLER based on target-specific info.
15430 Make sure that the caller and callee have compatible architectural
15431 features. Then go through the other possible target attributes
15432 and see if they can block inlining. Try not to reject always_inline
15433 callees unless they are incompatible architecturally. */
15434
15435 static bool
15436 aarch64_can_inline_p (tree caller, tree callee)
15437 {
15438 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15439 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15440
15441 struct cl_target_option *caller_opts
15442 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15443 : target_option_default_node);
15444
15445 struct cl_target_option *callee_opts
15446 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15447 : target_option_default_node);
15448
15449 /* Callee's ISA flags should be a subset of the caller's. */
15450 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15451 != callee_opts->x_aarch64_isa_flags)
15452 return false;
15453
15454 /* Allow a non-strict-align function to be inlined into a
15455 strict-align one, but not the other way around. */
15456 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15457 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15458 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15459 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15460 return false;
15461
15462 bool always_inline = lookup_attribute ("always_inline",
15463 DECL_ATTRIBUTES (callee));
15464
15465 /* If the architectural features match up and the callee is always_inline
15466 then the other attributes don't matter. */
15467 if (always_inline)
15468 return true;
15469
15470 if (caller_opts->x_aarch64_cmodel_var
15471 != callee_opts->x_aarch64_cmodel_var)
15472 return false;
15473
15474 if (caller_opts->x_aarch64_tls_dialect
15475 != callee_opts->x_aarch64_tls_dialect)
15476 return false;
15477
15478 /* Honour explicit requests to workaround errata. */
15479 if (!aarch64_tribools_ok_for_inlining_p (
15480 caller_opts->x_aarch64_fix_a53_err835769,
15481 callee_opts->x_aarch64_fix_a53_err835769,
15482 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15483 return false;
15484
15485 if (!aarch64_tribools_ok_for_inlining_p (
15486 caller_opts->x_aarch64_fix_a53_err843419,
15487 callee_opts->x_aarch64_fix_a53_err843419,
15488 2, TARGET_FIX_ERR_A53_843419))
15489 return false;
15490
15491 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15492 caller and callee and they don't match up, reject inlining. */
15493 if (!aarch64_tribools_ok_for_inlining_p (
15494 caller_opts->x_flag_omit_leaf_frame_pointer,
15495 callee_opts->x_flag_omit_leaf_frame_pointer,
15496 2, 1))
15497 return false;
15498
15499 /* If the callee has specific tuning overrides, respect them. */
15500 if (callee_opts->x_aarch64_override_tune_string != NULL
15501 && caller_opts->x_aarch64_override_tune_string == NULL)
15502 return false;
15503
15504 /* If the user specified tuning override strings for the
15505 caller and callee and they don't match up, reject inlining.
15506 We just do a string compare here, we don't analyze the meaning
15507 of the string, as it would be too costly for little gain. */
15508 if (callee_opts->x_aarch64_override_tune_string
15509 && caller_opts->x_aarch64_override_tune_string
15510 && (strcmp (callee_opts->x_aarch64_override_tune_string,
15511 caller_opts->x_aarch64_override_tune_string) != 0))
15512 return false;
15513
15514 return true;
15515 }
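/* For example, under the ISA subset check above a callee compiled with
   +sve cannot be inlined into a caller compiled without SVE, while a
   plain callee can be inlined into an SVE-enabled caller as far as ISA
   flags are concerned; the remaining checks can still reject the pair,
   e.g. when their -moverride tuning strings differ.  */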
15516
15517 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
15518 hasn't been initialized already. */
15519
15520 unsigned int
15521 aarch64_tlsdesc_abi_id ()
15522 {
15523 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15524 if (!tlsdesc_abi.initialized_p ())
15525 {
15526 HARD_REG_SET full_reg_clobbers;
15527 CLEAR_HARD_REG_SET (full_reg_clobbers);
15528 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15529 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15530 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15531 SET_HARD_REG_BIT (full_reg_clobbers, regno);
15532 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15533 }
15534 return tlsdesc_abi.id ();
15535 }
15536
15537 /* Return true if SYMBOL_REF X binds locally. */
15538
15539 static bool
15540 aarch64_symbol_binds_local_p (const_rtx x)
15541 {
15542 return (SYMBOL_REF_DECL (x)
15543 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15544 : SYMBOL_REF_LOCAL_P (x));
15545 }
15546
15547 /* Return true if SYMBOL_REF X is thread-local. */
15548 static bool
15549 aarch64_tls_symbol_p (rtx x)
15550 {
15551 if (! TARGET_HAVE_TLS)
15552 return false;
15553
15554 if (GET_CODE (x) != SYMBOL_REF)
15555 return false;
15556
15557 return SYMBOL_REF_TLS_MODEL (x) != 0;
15558 }
15559
15560 /* Classify a TLS symbol into one of the TLS kinds. */
15561 enum aarch64_symbol_type
15562 aarch64_classify_tls_symbol (rtx x)
15563 {
15564 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15565
15566 switch (tls_kind)
15567 {
15568 case TLS_MODEL_GLOBAL_DYNAMIC:
15569 case TLS_MODEL_LOCAL_DYNAMIC:
15570 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15571
15572 case TLS_MODEL_INITIAL_EXEC:
15573 switch (aarch64_cmodel)
15574 {
15575 case AARCH64_CMODEL_TINY:
15576 case AARCH64_CMODEL_TINY_PIC:
15577 return SYMBOL_TINY_TLSIE;
15578 default:
15579 return SYMBOL_SMALL_TLSIE;
15580 }
15581
15582 case TLS_MODEL_LOCAL_EXEC:
15583 if (aarch64_tls_size == 12)
15584 return SYMBOL_TLSLE12;
15585 else if (aarch64_tls_size == 24)
15586 return SYMBOL_TLSLE24;
15587 else if (aarch64_tls_size == 32)
15588 return SYMBOL_TLSLE32;
15589 else if (aarch64_tls_size == 48)
15590 return SYMBOL_TLSLE48;
15591 else
15592 gcc_unreachable ();
15593
15594 case TLS_MODEL_EMULATED:
15595 case TLS_MODEL_NONE:
15596 return SYMBOL_FORCE_TO_MEM;
15597
15598 default:
15599 gcc_unreachable ();
15600 }
15601 }
15602
15603 /* Return the correct method for accessing X + OFFSET, where X is either
15604 a SYMBOL_REF or LABEL_REF. */
15605
15606 enum aarch64_symbol_type
15607 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15608 {
15609 if (GET_CODE (x) == LABEL_REF)
15610 {
15611 switch (aarch64_cmodel)
15612 {
15613 case AARCH64_CMODEL_LARGE:
15614 return SYMBOL_FORCE_TO_MEM;
15615
15616 case AARCH64_CMODEL_TINY_PIC:
15617 case AARCH64_CMODEL_TINY:
15618 return SYMBOL_TINY_ABSOLUTE;
15619
15620 case AARCH64_CMODEL_SMALL_SPIC:
15621 case AARCH64_CMODEL_SMALL_PIC:
15622 case AARCH64_CMODEL_SMALL:
15623 return SYMBOL_SMALL_ABSOLUTE;
15624
15625 default:
15626 gcc_unreachable ();
15627 }
15628 }
15629
15630 if (GET_CODE (x) == SYMBOL_REF)
15631 {
15632 if (aarch64_tls_symbol_p (x))
15633 return aarch64_classify_tls_symbol (x);
15634
15635 switch (aarch64_cmodel)
15636 {
15637 case AARCH64_CMODEL_TINY:
15638 /* When we retrieve symbol + offset address, we have to make sure
15639 the offset does not cause overflow of the final address. But
15640 we have no way of knowing the address of symbol at compile time
15641 so we can't accurately say if the distance between the PC and
15642 symbol + offset is outside the addressable range of +/-1MB in the
15643 TINY code model. So we limit the maximum offset to +/-64KB and
15644 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15645 If offset_within_block_p is true we allow larger offsets.
15646 Furthermore force to memory if the symbol is a weak reference to
15647 something that doesn't resolve to a symbol in this module. */
15648
15649 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15650 return SYMBOL_FORCE_TO_MEM;
15651 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15652 || offset_within_block_p (x, offset)))
15653 return SYMBOL_FORCE_TO_MEM;
15654
15655 return SYMBOL_TINY_ABSOLUTE;
15656
15657 case AARCH64_CMODEL_SMALL:
15658 /* Same reasoning as the tiny code model, but the offset cap here is
15659 1MB, allowing +/-3.9GB for the offset to the symbol. */
15660
15661 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15662 return SYMBOL_FORCE_TO_MEM;
15663 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15664 || offset_within_block_p (x, offset)))
15665 return SYMBOL_FORCE_TO_MEM;
15666
15667 return SYMBOL_SMALL_ABSOLUTE;
15668
15669 case AARCH64_CMODEL_TINY_PIC:
15670 if (!aarch64_symbol_binds_local_p (x))
15671 return SYMBOL_TINY_GOT;
15672 return SYMBOL_TINY_ABSOLUTE;
15673
15674 case AARCH64_CMODEL_SMALL_SPIC:
15675 case AARCH64_CMODEL_SMALL_PIC:
15676 if (!aarch64_symbol_binds_local_p (x))
15677 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15678 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15679 return SYMBOL_SMALL_ABSOLUTE;
15680
15681 case AARCH64_CMODEL_LARGE:
15682 /* This is alright even in PIC code as the constant
15683 pool reference is always PC relative and within
15684 the same translation unit. */
15685 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15686 return SYMBOL_SMALL_ABSOLUTE;
15687 else
15688 return SYMBOL_FORCE_TO_MEM;
15689
15690 default:
15691 gcc_unreachable ();
15692 }
15693 }
15694
15695 /* By default push everything into the constant pool. */
15696 return SYMBOL_FORCE_TO_MEM;
15697 }
15698
15699 bool
15700 aarch64_constant_address_p (rtx x)
15701 {
15702 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15703 }
15704
15705 bool
15706 aarch64_legitimate_pic_operand_p (rtx x)
15707 {
15708 if (GET_CODE (x) == SYMBOL_REF
15709 || (GET_CODE (x) == CONST
15710 && GET_CODE (XEXP (x, 0)) == PLUS
15711 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15712 return false;
15713
15714 return true;
15715 }
15716
15717 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15718 that should be rematerialized rather than spilled. */
15719
15720 static bool
15721 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15722 {
15723 /* Support CSE and rematerialization of common constants. */
15724 if (CONST_INT_P (x)
15725 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15726 || GET_CODE (x) == CONST_VECTOR)
15727 return true;
15728
15729 /* Do not allow vector struct mode constants for Advanced SIMD.
15730 We could support 0 and -1 easily, but they need support in
15731 aarch64-simd.md. */
15732 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15733 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15734 return false;
15735
15736 /* Only accept variable-length vector constants if they can be
15737 handled directly.
15738
15739 ??? It would be possible to handle rematerialization of other
15740 constants via secondary reloads. */
15741 if (vec_flags & VEC_ANY_SVE)
15742 return aarch64_simd_valid_immediate (x, NULL);
15743
15744 if (GET_CODE (x) == HIGH)
15745 x = XEXP (x, 0);
15746
15747 /* Accept polynomial constants that can be calculated by using the
15748 destination of a move as the sole temporary. Constants that
15749 require a second temporary cannot be rematerialized (they can't be
15750 forced to memory and also aren't legitimate constants). */
15751 poly_int64 offset;
15752 if (poly_int_rtx_p (x, &offset))
15753 return aarch64_offset_temporaries (false, offset) <= 1;
15754
15755 /* If an offset is being added to something else, we need to allow the
15756 base to be moved into the destination register, meaning that there
15757 are no free temporaries for the offset. */
15758 x = strip_offset (x, &offset);
15759 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15760 return false;
15761
15762 /* Do not allow const (plus (anchor_symbol, const_int)). */
15763 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15764 return false;
15765
15766 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15767 so spilling them is better than rematerialization. */
15768 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15769 return true;
15770
15771 /* Label references are always constant. */
15772 if (GET_CODE (x) == LABEL_REF)
15773 return true;
15774
15775 return false;
15776 }
15777
15778 rtx
15779 aarch64_load_tp (rtx target)
15780 {
15781 if (!target
15782 || GET_MODE (target) != Pmode
15783 || !register_operand (target, Pmode))
15784 target = gen_reg_rtx (Pmode);
15785
15786 /* Can return in any reg. */
15787 emit_insn (gen_aarch64_load_tp_hard (target));
15788 return target;
15789 }
15790
15791 /* On AAPCS systems, this is the "struct __va_list". */
15792 static GTY(()) tree va_list_type;
15793
15794 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15795 Return the type to use as __builtin_va_list.
15796
15797 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15798
15799 struct __va_list
15800 {
15801 void *__stack;
15802 void *__gr_top;
15803 void *__vr_top;
15804 int __gr_offs;
15805 int __vr_offs;
15806 }; */
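/* Illustration (assuming the register save areas are not shrunk by the
   tree-stdarg analysis): for "int f (int a, ...)", only w0 carries a named
   argument, so va_start below leaves:

   __gr_offs = -56 (x1-x7 saved, 7 * 8 bytes)
   __vr_offs = -128 (q0-q7 saved, 8 * 16 bytes)
   __gr_top / __vr_top = one byte past the respective save area
   __stack = address of the first anonymous argument passed on the stack.  */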
15807
15808 static tree
15809 aarch64_build_builtin_va_list (void)
15810 {
15811 tree va_list_name;
15812 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15813
15814 /* Create the type. */
15815 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15816 /* Give it the required name. */
15817 va_list_name = build_decl (BUILTINS_LOCATION,
15818 TYPE_DECL,
15819 get_identifier ("__va_list"),
15820 va_list_type);
15821 DECL_ARTIFICIAL (va_list_name) = 1;
15822 TYPE_NAME (va_list_type) = va_list_name;
15823 TYPE_STUB_DECL (va_list_type) = va_list_name;
15824
15825 /* Create the fields. */
15826 f_stack = build_decl (BUILTINS_LOCATION,
15827 FIELD_DECL, get_identifier ("__stack"),
15828 ptr_type_node);
15829 f_grtop = build_decl (BUILTINS_LOCATION,
15830 FIELD_DECL, get_identifier ("__gr_top"),
15831 ptr_type_node);
15832 f_vrtop = build_decl (BUILTINS_LOCATION,
15833 FIELD_DECL, get_identifier ("__vr_top"),
15834 ptr_type_node);
15835 f_groff = build_decl (BUILTINS_LOCATION,
15836 FIELD_DECL, get_identifier ("__gr_offs"),
15837 integer_type_node);
15838 f_vroff = build_decl (BUILTINS_LOCATION,
15839 FIELD_DECL, get_identifier ("__vr_offs"),
15840 integer_type_node);
15841
15842 /* Tell the tree-stdarg pass about our internal offset fields.
15843 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15844 purposes, to identify whether the code is updating the va_list internal
15845 offset fields in an irregular way. */
15846 va_list_gpr_counter_field = f_groff;
15847 va_list_fpr_counter_field = f_vroff;
15848
15849 DECL_ARTIFICIAL (f_stack) = 1;
15850 DECL_ARTIFICIAL (f_grtop) = 1;
15851 DECL_ARTIFICIAL (f_vrtop) = 1;
15852 DECL_ARTIFICIAL (f_groff) = 1;
15853 DECL_ARTIFICIAL (f_vroff) = 1;
15854
15855 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15856 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15857 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15858 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15859 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15860
15861 TYPE_FIELDS (va_list_type) = f_stack;
15862 DECL_CHAIN (f_stack) = f_grtop;
15863 DECL_CHAIN (f_grtop) = f_vrtop;
15864 DECL_CHAIN (f_vrtop) = f_groff;
15865 DECL_CHAIN (f_groff) = f_vroff;
15866
15867 /* Compute its layout. */
15868 layout_type (va_list_type);
15869
15870 return va_list_type;
15871 }
15872
15873 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15874 static void
15875 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15876 {
15877 const CUMULATIVE_ARGS *cum;
15878 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15879 tree stack, grtop, vrtop, groff, vroff;
15880 tree t;
15881 int gr_save_area_size = cfun->va_list_gpr_size;
15882 int vr_save_area_size = cfun->va_list_fpr_size;
15883 int vr_offset;
15884
15885 cum = &crtl->args.info;
15886 if (cfun->va_list_gpr_size)
15887 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15888 cfun->va_list_gpr_size);
15889 if (cfun->va_list_fpr_size)
15890 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15891 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15892
15893 if (!TARGET_FLOAT)
15894 {
15895 gcc_assert (cum->aapcs_nvrn == 0);
15896 vr_save_area_size = 0;
15897 }
15898
15899 f_stack = TYPE_FIELDS (va_list_type_node);
15900 f_grtop = DECL_CHAIN (f_stack);
15901 f_vrtop = DECL_CHAIN (f_grtop);
15902 f_groff = DECL_CHAIN (f_vrtop);
15903 f_vroff = DECL_CHAIN (f_groff);
15904
15905 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15906 NULL_TREE);
15907 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15908 NULL_TREE);
15909 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15910 NULL_TREE);
15911 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15912 NULL_TREE);
15913 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15914 NULL_TREE);
15915
15916 /* Emit code to initialize STACK, which points to the next varargs stack
15917 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15918 by named arguments. STACK is 8-byte aligned. */
15919 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15920 if (cum->aapcs_stack_size > 0)
15921 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15922 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15923 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15924
15925 /* Emit code to initialize GRTOP, the top of the GR save area.
15926 virtual_incoming_args_rtx should have been 16-byte aligned. */
15927 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15928 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15929 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15930
15931 /* Emit code to initialize VRTOP, the top of the VR save area.
15932 This address is gr_save_area_bytes below GRTOP, rounded
15933 down to the next 16-byte boundary. */
15934 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15935 vr_offset = ROUND_UP (gr_save_area_size,
15936 STACK_BOUNDARY / BITS_PER_UNIT);
15937
15938 if (vr_offset)
15939 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15940 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15941 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15942
15943 /* Emit code to initialize GROFF, the offset from GRTOP of the
15944 next GPR argument. */
15945 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15946 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15947 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15948
15949 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15950 of the next VR argument. */
15951 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15952 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15953 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15954 }
15955
15956 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15957
15958 static tree
15959 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15960 gimple_seq *post_p ATTRIBUTE_UNUSED)
15961 {
15962 tree addr;
15963 bool indirect_p;
15964 bool is_ha; /* is HFA or HVA. */
15965 bool dw_align; /* double-word align. */
15966 machine_mode ag_mode = VOIDmode;
15967 int nregs;
15968 machine_mode mode;
15969
15970 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15971 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15972 HOST_WIDE_INT size, rsize, adjust, align;
15973 tree t, u, cond1, cond2;
15974
15975 indirect_p = pass_va_arg_by_reference (type);
15976 if (indirect_p)
15977 type = build_pointer_type (type);
15978
15979 mode = TYPE_MODE (type);
15980
15981 f_stack = TYPE_FIELDS (va_list_type_node);
15982 f_grtop = DECL_CHAIN (f_stack);
15983 f_vrtop = DECL_CHAIN (f_grtop);
15984 f_groff = DECL_CHAIN (f_vrtop);
15985 f_vroff = DECL_CHAIN (f_groff);
15986
15987 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15988 f_stack, NULL_TREE);
15989 size = int_size_in_bytes (type);
15990
15991 bool abi_break;
15992 align
15993 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15994
15995 dw_align = false;
15996 adjust = 0;
15997 if (aarch64_vfp_is_call_or_return_candidate (mode,
15998 type,
15999 &ag_mode,
16000 &nregs,
16001 &is_ha))
16002 {
16003 /* No frontends can create types with variable-sized modes, so we
16004 shouldn't be asked to pass or return them. */
16005 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16006
16007 /* TYPE passed in fp/simd registers. */
16008 if (!TARGET_FLOAT)
16009 aarch64_err_no_fpadvsimd (mode);
16010
16011 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16012 unshare_expr (valist), f_vrtop, NULL_TREE);
16013 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16014 unshare_expr (valist), f_vroff, NULL_TREE);
16015
16016 rsize = nregs * UNITS_PER_VREG;
16017
16018 if (is_ha)
16019 {
16020 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16021 adjust = UNITS_PER_VREG - ag_size;
16022 }
16023 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16024 && size < UNITS_PER_VREG)
16025 {
16026 adjust = UNITS_PER_VREG - size;
16027 }
16028 }
16029 else
16030 {
16031 /* TYPE passed in general registers. */
16032 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16033 unshare_expr (valist), f_grtop, NULL_TREE);
16034 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16035 unshare_expr (valist), f_groff, NULL_TREE);
16036 rsize = ROUND_UP (size, UNITS_PER_WORD);
16037 nregs = rsize / UNITS_PER_WORD;
16038
16039 if (align > 8)
16040 {
16041 if (abi_break && warn_psabi)
16042 inform (input_location, "parameter passing for argument of type "
16043 "%qT changed in GCC 9.1", type);
16044 dw_align = true;
16045 }
16046
16047 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16048 && size < UNITS_PER_WORD)
16049 {
16050 adjust = UNITS_PER_WORD - size;
16051 }
16052 }
16053
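/* In outline, the trees built below implement the following pseudo-code
   (a sketch; gr/vr and the big-endian ADJUST were selected above):

   off = ap.__gr_offs;                    // or ap.__vr_offs
   if (off >= 0)
     goto on_stack;
   if (dw_align)
     off = (off + 15) & -16;
   ap.__gr_offs = off + rsize;
   if (ap.__gr_offs > 0)
     goto on_stack;
   addr = ap.__gr_top + off + adjust;     // argument is in the save area

   on_stack:
   arg = ap.__stack;
   if (align > 8)
     arg = (arg + 15) & -16;
   ap.__stack = (arg + size + 7) & -8;
   addr = arg;                            // plus a big-endian adjustment  */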
16054 /* Get a local temporary for the field value. */
16055 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16056
16057 /* Emit code to branch if off >= 0. */
16058 t = build2 (GE_EXPR, boolean_type_node, off,
16059 build_int_cst (TREE_TYPE (off), 0));
16060 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16061
16062 if (dw_align)
16063 {
16064 /* Emit: offs = (offs + 15) & -16. */
16065 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16066 build_int_cst (TREE_TYPE (off), 15));
16067 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16068 build_int_cst (TREE_TYPE (off), -16));
16069 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16070 }
16071 else
16072 roundup = NULL;
16073
16074 /* Update ap.__[g|v]r_offs */
16075 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16076 build_int_cst (TREE_TYPE (off), rsize));
16077 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16078
16079 /* String up. */
16080 if (roundup)
16081 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16082
16083 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16084 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16085 build_int_cst (TREE_TYPE (f_off), 0));
16086 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16087
16088 /* String up: make sure the assignment happens before the use. */
16089 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16090 COND_EXPR_ELSE (cond1) = t;
16091
16092 /* Prepare the trees handling the argument that is passed on the stack;
16093 the top-level node is stored in ON_STACK. */
16094 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16095 if (align > 8)
16096 {
16097 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
16098 t = fold_build_pointer_plus_hwi (arg, 15);
16099 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16100 build_int_cst (TREE_TYPE (t), -16));
16101 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16102 }
16103 else
16104 roundup = NULL;
16105 /* Advance ap.__stack */
16106 t = fold_build_pointer_plus_hwi (arg, size + 7);
16107 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16108 build_int_cst (TREE_TYPE (t), -8));
16109 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16110 /* String up roundup and advance. */
16111 if (roundup)
16112 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16113 /* String up with arg */
16114 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16115 /* Big-endianness related address adjustment. */
16116 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16117 && size < UNITS_PER_WORD)
16118 {
16119 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16120 size_int (UNITS_PER_WORD - size));
16121 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16122 }
16123
16124 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16125 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16126
16127 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16128 t = off;
16129 if (adjust)
16130 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16131 build_int_cst (TREE_TYPE (off), adjust));
16132
16133 t = fold_convert (sizetype, t);
16134 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16135
16136 if (is_ha)
16137 {
16138 /* type ha; // treat as "struct {ftype field[n];}"
16139 ... [computing offs]
16140 for (i = 0; i < nregs; ++i, offs += 16)
16141 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16142 return ha; */
16143 int i;
16144 tree tmp_ha, field_t, field_ptr_t;
16145
16146 /* Declare a local variable. */
16147 tmp_ha = create_tmp_var_raw (type, "ha");
16148 gimple_add_tmp_var (tmp_ha);
16149
16150 /* Establish the base type. */
16151 switch (ag_mode)
16152 {
16153 case E_SFmode:
16154 field_t = float_type_node;
16155 field_ptr_t = float_ptr_type_node;
16156 break;
16157 case E_DFmode:
16158 field_t = double_type_node;
16159 field_ptr_t = double_ptr_type_node;
16160 break;
16161 case E_TFmode:
16162 field_t = long_double_type_node;
16163 field_ptr_t = long_double_ptr_type_node;
16164 break;
16165 case E_HFmode:
16166 field_t = aarch64_fp16_type_node;
16167 field_ptr_t = aarch64_fp16_ptr_type_node;
16168 break;
16169 case E_BFmode:
16170 field_t = aarch64_bf16_type_node;
16171 field_ptr_t = aarch64_bf16_ptr_type_node;
16172 break;
16173 case E_V2SImode:
16174 case E_V4SImode:
16175 {
16176 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16177 field_t = build_vector_type_for_mode (innertype, ag_mode);
16178 field_ptr_t = build_pointer_type (field_t);
16179 }
16180 break;
16181 default:
16182 gcc_assert (0);
16183 }
16184
16185 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
16186 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16187 addr = t;
16188 t = fold_convert (field_ptr_t, addr);
16189 t = build2 (MODIFY_EXPR, field_t,
16190 build1 (INDIRECT_REF, field_t, tmp_ha),
16191 build1 (INDIRECT_REF, field_t, t));
16192
16193 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16194 for (i = 1; i < nregs; ++i)
16195 {
16196 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16197 u = fold_convert (field_ptr_t, addr);
16198 u = build2 (MODIFY_EXPR, field_t,
16199 build2 (MEM_REF, field_t, tmp_ha,
16200 build_int_cst (field_ptr_t,
16201 (i *
16202 int_size_in_bytes (field_t)))),
16203 build1 (INDIRECT_REF, field_t, u));
16204 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16205 }
16206
16207 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16208 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16209 }
16210
16211 COND_EXPR_ELSE (cond2) = t;
16212 addr = fold_convert (build_pointer_type (type), cond1);
16213 addr = build_va_arg_indirect_ref (addr);
16214
16215 if (indirect_p)
16216 addr = build_va_arg_indirect_ref (addr);
16217
16218 return addr;
16219 }
16220
16221 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
16222
16223 static void
16224 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16225 const function_arg_info &arg,
16226 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16227 {
16228 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16229 CUMULATIVE_ARGS local_cum;
16230 int gr_saved = cfun->va_list_gpr_size;
16231 int vr_saved = cfun->va_list_fpr_size;
16232
16233 /* The caller has advanced CUM up to, but not beyond, the last named
16234 argument. Advance a local copy of CUM past the last "real" named
16235 argument, to find out how many registers are left over. */
16236 local_cum = *cum;
16237 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
16238
16239 /* Find out how many registers we need to save.
16240 Honor the tree-stdarg analysis results. */
16241 if (cfun->va_list_gpr_size)
16242 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16243 cfun->va_list_gpr_size / UNITS_PER_WORD);
16244 if (cfun->va_list_fpr_size)
16245 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16246 cfun->va_list_fpr_size / UNITS_PER_VREG);
16247
16248 if (!TARGET_FLOAT)
16249 {
16250 gcc_assert (local_cum.aapcs_nvrn == 0);
16251 vr_saved = 0;
16252 }
16253
16254 if (!no_rtl)
16255 {
16256 if (gr_saved > 0)
16257 {
16258 rtx ptr, mem;
16259
16260 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16261 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16262 - gr_saved * UNITS_PER_WORD);
16263 mem = gen_frame_mem (BLKmode, ptr);
16264 set_mem_alias_set (mem, get_varargs_alias_set ());
16265
16266 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16267 mem, gr_saved);
16268 }
16269 if (vr_saved > 0)
16270 {
16271 /* We can't use move_block_from_reg, because it will use
16272 the wrong mode, storing D regs only. */
16273 machine_mode mode = TImode;
16274 int off, i, vr_start;
16275
16276 /* Set OFF to the offset from virtual_incoming_args_rtx of
16277 the first vector register. The VR save area lies below
16278 the GR one, and is aligned to 16 bytes. */
16279 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16280 STACK_BOUNDARY / BITS_PER_UNIT);
16281 off -= vr_saved * UNITS_PER_VREG;
16282
16283 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16284 for (i = 0; i < vr_saved; ++i)
16285 {
16286 rtx ptr, mem;
16287
16288 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16289 mem = gen_frame_mem (mode, ptr);
16290 set_mem_alias_set (mem, get_varargs_alias_set ());
16291 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16292 off += UNITS_PER_VREG;
16293 }
16294 }
16295 }
16296
16297 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16298 any complication of having crtl->args.pretend_args_size changed. */
16299 cfun->machine->frame.saved_varargs_size
16300 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16301 STACK_BOUNDARY / BITS_PER_UNIT)
16302 + vr_saved * UNITS_PER_VREG);
16303 }
16304
16305 static void
16306 aarch64_conditional_register_usage (void)
16307 {
16308 int i;
16309 if (!TARGET_FLOAT)
16310 {
16311 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16312 {
16313 fixed_regs[i] = 1;
16314 call_used_regs[i] = 1;
16315 }
16316 }
16317 if (!TARGET_SVE)
16318 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16319 {
16320 fixed_regs[i] = 1;
16321 call_used_regs[i] = 1;
16322 }
16323
16324 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16325 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16326 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16327
16328 /* When tracking speculation, we need a couple of call-clobbered registers
16329 to track the speculation state. It would be nice to just use
16330 IP0 and IP1, but currently there are numerous places that just
16331 assume these registers are free for other uses (e.g. pointer
16332 authentication). */
16333 if (aarch64_track_speculation)
16334 {
16335 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16336 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16337 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16338 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16339 }
16340 }
16341
16342 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16343
16344 bool
16345 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16346 {
16347 /* For records we're passed a FIELD_DECL, for arrays we're passed
16348 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16349 const_tree type = TREE_TYPE (field_or_array);
16350
16351 /* Assign BLKmode to anything that contains multiple SVE predicates.
16352 For structures, the "multiple" case is indicated by MODE being
16353 VOIDmode. */
16354 unsigned int num_zr, num_pr;
16355 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16356 {
16357 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16358 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16359 TYPE_SIZE (type));
16360 return mode == VOIDmode;
16361 }
16362
16363 return default_member_type_forces_blk (field_or_array, mode);
16364 }
16365
16366 /* Walk down the type tree of TYPE counting consecutive base elements.
16367 If *MODEP is VOIDmode, then set it to the first valid floating point
16368 type. If a non-floating point type is found, or if a floating point
16369 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16370 otherwise return the count in the sub-tree. */
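/* For example: "struct { float x, y, z; }" yields 3 with *MODEP == SFmode,
   "double _Complex" yields 2 with *MODEP == DFmode, and
   "struct { float f; double d; }" yields -1 because the element modes
   differ.  */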
16371 static int
16372 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
16373 {
16374 machine_mode mode;
16375 HOST_WIDE_INT size;
16376
16377 if (aarch64_sve::builtin_type_p (type))
16378 return -1;
16379
16380 switch (TREE_CODE (type))
16381 {
16382 case REAL_TYPE:
16383 mode = TYPE_MODE (type);
16384 if (mode != DFmode && mode != SFmode
16385 && mode != TFmode && mode != HFmode)
16386 return -1;
16387
16388 if (*modep == VOIDmode)
16389 *modep = mode;
16390
16391 if (*modep == mode)
16392 return 1;
16393
16394 break;
16395
16396 case COMPLEX_TYPE:
16397 mode = TYPE_MODE (TREE_TYPE (type));
16398 if (mode != DFmode && mode != SFmode
16399 && mode != TFmode && mode != HFmode)
16400 return -1;
16401
16402 if (*modep == VOIDmode)
16403 *modep = mode;
16404
16405 if (*modep == mode)
16406 return 2;
16407
16408 break;
16409
16410 case VECTOR_TYPE:
16411 /* Use V2SImode and V4SImode as representatives of all 64-bit
16412 and 128-bit vector types. */
16413 size = int_size_in_bytes (type);
16414 switch (size)
16415 {
16416 case 8:
16417 mode = V2SImode;
16418 break;
16419 case 16:
16420 mode = V4SImode;
16421 break;
16422 default:
16423 return -1;
16424 }
16425
16426 if (*modep == VOIDmode)
16427 *modep = mode;
16428
16429 /* Vector modes are considered to be opaque: two vectors are
16430 equivalent for the purposes of being homogeneous aggregates
16431 if they are the same size. */
16432 if (*modep == mode)
16433 return 1;
16434
16435 break;
16436
16437 case ARRAY_TYPE:
16438 {
16439 int count;
16440 tree index = TYPE_DOMAIN (type);
16441
16442 /* Can't handle incomplete types or sizes that are not
16443 fixed. */
16444 if (!COMPLETE_TYPE_P (type)
16445 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16446 return -1;
16447
16448 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
16449 if (count == -1
16450 || !index
16451 || !TYPE_MAX_VALUE (index)
16452 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16453 || !TYPE_MIN_VALUE (index)
16454 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16455 || count < 0)
16456 return -1;
16457
16458 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16459 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16460
16461 /* There must be no padding. */
16462 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16463 count * GET_MODE_BITSIZE (*modep)))
16464 return -1;
16465
16466 return count;
16467 }
16468
16469 case RECORD_TYPE:
16470 {
16471 int count = 0;
16472 int sub_count;
16473 tree field;
16474
16475 /* Can't handle incomplete types or sizes that are not
16476 fixed. */
16477 if (!COMPLETE_TYPE_P (type)
16478 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16479 return -1;
16480
16481 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16482 {
16483 if (TREE_CODE (field) != FIELD_DECL)
16484 continue;
16485
16486 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16487 if (sub_count < 0)
16488 return -1;
16489 count += sub_count;
16490 }
16491
16492 /* There must be no padding. */
16493 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16494 count * GET_MODE_BITSIZE (*modep)))
16495 return -1;
16496
16497 return count;
16498 }
16499
16500 case UNION_TYPE:
16501 case QUAL_UNION_TYPE:
16502 {
16503 /* These aren't very interesting except in a degenerate case. */
16504 int count = 0;
16505 int sub_count;
16506 tree field;
16507
16508 /* Can't handle incomplete types or sizes that are not
16509 fixed. */
16510 if (!COMPLETE_TYPE_P (type)
16511 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16512 return -1;
16513
16514 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16515 {
16516 if (TREE_CODE (field) != FIELD_DECL)
16517 continue;
16518
16519 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16520 if (sub_count < 0)
16521 return -1;
16522 count = count > sub_count ? count : sub_count;
16523 }
16524
16525 /* There must be no padding. */
16526 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16527 count * GET_MODE_BITSIZE (*modep)))
16528 return -1;
16529
16530 return count;
16531 }
16532
16533 default:
16534 break;
16535 }
16536
16537 return -1;
16538 }
16539
16540 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16541 type as described in AAPCS64 \S 4.1.2.
16542
16543 See the comment above aarch64_composite_type_p for the notes on MODE. */
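/* E.g. the 8-byte and 16-byte Advanced SIMD types int32x2_t and
   float32x4_t are short vectors, whereas SVE types such as svint32_t and
   a 32-byte GNU vector type are not.  */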
16544
16545 static bool
16546 aarch64_short_vector_p (const_tree type,
16547 machine_mode mode)
16548 {
16549 poly_int64 size = -1;
16550
16551 if (type && TREE_CODE (type) == VECTOR_TYPE)
16552 {
16553 if (aarch64_sve::builtin_type_p (type))
16554 return false;
16555 size = int_size_in_bytes (type);
16556 }
16557 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16558 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16559 {
16560 /* Rely only on the type, not the mode, when processing SVE types. */
16561 if (type && aarch64_some_values_include_pst_objects_p (type))
16562 gcc_assert (aarch64_sve_mode_p (mode));
16563 else
16564 size = GET_MODE_SIZE (mode);
16565 }
16566 if (known_eq (size, 8) || known_eq (size, 16))
16567 {
16568 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16569 they are being treated as scalable AAPCS64 types. */
16570 gcc_assert (!aarch64_sve_mode_p (mode));
16571 return true;
16572 }
16573 return false;
16574 }
16575
16576 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16577 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16578 array types. The C99 floating-point complex types are also considered
16579 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16580 types, which are GCC extensions and out of the scope of AAPCS64, are
16581 treated as composite types here as well.
16582
16583 Note that MODE itself is not sufficient in determining whether a type
16584 is such a composite type or not. This is because
16585 stor-layout.c:compute_record_mode may have already changed the MODE
16586 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16587 structure with only one field may have its MODE set to the mode of the
16588 field. Also an integer mode whose size matches the size of the
16589 RECORD_TYPE type may be used to substitute the original mode
16590 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16591 solely relied on. */
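/* For example, "struct { float f; }" may be given SFmode by
   compute_record_mode, but it is still a composite type here because TYPE
   is checked first; "double _Complex" is composite, while a plain "double"
   is not.  */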
16592
16593 static bool
16594 aarch64_composite_type_p (const_tree type,
16595 machine_mode mode)
16596 {
16597 if (aarch64_short_vector_p (type, mode))
16598 return false;
16599
16600 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16601 return true;
16602
16603 if (mode == BLKmode
16604 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16605 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16606 return true;
16607
16608 return false;
16609 }
16610
16611 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16612 shall be passed or returned in simd/fp register(s) (providing these
16613 parameter passing registers are available).
16614
16615 Upon successful return, *COUNT returns the number of needed registers,
16616 *BASE_MODE returns the mode of the individual register and, when IS_HA
16617 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16618 floating-point aggregate or a homogeneous short-vector aggregate. */
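/* For example: "struct { double a, b; }" gives *COUNT == 2,
   *BASE_MODE == DFmode and *IS_HA == true; a short vector such as
   float32x4_t gives *COUNT == 1 with *BASE_MODE == V4SFmode; and
   "struct { double d; int i; }" is rejected.  */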
16619
16620 static bool
16621 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16622 const_tree type,
16623 machine_mode *base_mode,
16624 int *count,
16625 bool *is_ha)
16626 {
16627 if (is_ha != NULL) *is_ha = false;
16628
16629 machine_mode new_mode = VOIDmode;
16630 bool composite_p = aarch64_composite_type_p (type, mode);
16631
16632 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16633 || aarch64_short_vector_p (type, mode))
16634 {
16635 *count = 1;
16636 new_mode = mode;
16637 }
16638 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16639 {
16640 if (is_ha != NULL) *is_ha = true;
16641 *count = 2;
16642 new_mode = GET_MODE_INNER (mode);
16643 }
16644 else if (type && composite_p)
16645 {
16646 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16647
16648 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16649 {
16650 if (is_ha != NULL) *is_ha = true;
16651 *count = ag_count;
16652 }
16653 else
16654 return false;
16655 }
16656 else
16657 return false;
16658
16659 gcc_assert (!aarch64_sve_mode_p (new_mode));
16660 *base_mode = new_mode;
16661 return true;
16662 }
16663
16664 /* Implement TARGET_STRUCT_VALUE_RTX. */
16665
16666 static rtx
16667 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16668 int incoming ATTRIBUTE_UNUSED)
16669 {
16670 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16671 }
16672
16673 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
16674 static bool
16675 aarch64_vector_mode_supported_p (machine_mode mode)
16676 {
16677 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16678 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16679 }
16680
16681 /* Return the full-width SVE vector mode for element mode MODE, if one
16682 exists. */
16683 opt_machine_mode
16684 aarch64_full_sve_mode (scalar_mode mode)
16685 {
16686 switch (mode)
16687 {
16688 case E_DFmode:
16689 return VNx2DFmode;
16690 case E_SFmode:
16691 return VNx4SFmode;
16692 case E_HFmode:
16693 return VNx8HFmode;
16694 case E_BFmode:
16695 return VNx8BFmode;
16696 case E_DImode:
16697 return VNx2DImode;
16698 case E_SImode:
16699 return VNx4SImode;
16700 case E_HImode:
16701 return VNx8HImode;
16702 case E_QImode:
16703 return VNx16QImode;
16704 default:
16705 return opt_machine_mode ();
16706 }
16707 }
16708
16709 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16710 if it exists. */
16711 opt_machine_mode
16712 aarch64_vq_mode (scalar_mode mode)
16713 {
16714 switch (mode)
16715 {
16716 case E_DFmode:
16717 return V2DFmode;
16718 case E_SFmode:
16719 return V4SFmode;
16720 case E_HFmode:
16721 return V8HFmode;
16722 case E_BFmode:
16723 return V8BFmode;
16724 case E_SImode:
16725 return V4SImode;
16726 case E_HImode:
16727 return V8HImode;
16728 case E_QImode:
16729 return V16QImode;
16730 case E_DImode:
16731 return V2DImode;
16732 default:
16733 return opt_machine_mode ();
16734 }
16735 }
16736
16737 /* Return the appropriate SIMD container mode
16738 for MODE within a vector of WIDTH bits. */
16739 static machine_mode
16740 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16741 {
16742 if (TARGET_SVE
16743 && maybe_ne (width, 128)
16744 && known_eq (width, BITS_PER_SVE_VECTOR))
16745 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16746
16747 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16748 if (TARGET_SIMD)
16749 {
16750 if (known_eq (width, 128))
16751 return aarch64_vq_mode (mode).else_mode (word_mode);
16752 else
16753 switch (mode)
16754 {
16755 case E_SFmode:
16756 return V2SFmode;
16757 case E_HFmode:
16758 return V4HFmode;
16759 case E_BFmode:
16760 return V4BFmode;
16761 case E_SImode:
16762 return V2SImode;
16763 case E_HImode:
16764 return V4HImode;
16765 case E_QImode:
16766 return V8QImode;
16767 default:
16768 break;
16769 }
16770 }
16771 return word_mode;
16772 }
16773
16774 /* Return the preferred SIMD mode for MODE: a full SVE vector if SVE is enabled, otherwise a 128-bit Advanced SIMD container. */
16775 static machine_mode
16776 aarch64_preferred_simd_mode (scalar_mode mode)
16777 {
16778 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16779 return aarch64_simd_container_mode (mode, bits);
16780 }
16781
16782 /* Return a list of possible vector sizes for the vectorizer
16783 to iterate over. */
16784 static unsigned int
16785 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16786 {
16787 static const machine_mode sve_modes[] = {
16788 /* Try using full vectors for all element types. */
16789 VNx16QImode,
16790
16791 /* Try using 16-bit containers for 8-bit elements and full vectors
16792 for wider elements. */
16793 VNx8QImode,
16794
16795 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16796 full vectors for wider elements. */
16797 VNx4QImode,
16798
16799 /* Try using 64-bit containers for all element types. */
16800 VNx2QImode
16801 };
16802
16803 static const machine_mode advsimd_modes[] = {
16804 /* Try using 128-bit vectors for all element types. */
16805 V16QImode,
16806
16807 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16808 for wider elements. */
16809 V8QImode,
16810
16811 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16812 for wider elements.
16813
16814 TODO: We could support a limited form of V4QImode too, so that
16815 we use 32-bit vectors for 8-bit elements. */
16816 V4HImode,
16817
16818 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16819 for 64-bit elements.
16820
16821 TODO: We could similarly support limited forms of V2QImode and V2HImode
16822 for this case. */
16823 V2SImode
16824 };
16825
16826 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16827 This is because:
16828
16829 - If we can't use N-byte Advanced SIMD vectors then the placement
16830 doesn't matter; we'll just continue as though the Advanced SIMD
16831 entry didn't exist.
16832
16833 - If an SVE main loop with N bytes ends up being cheaper than an
16834 Advanced SIMD main loop with N bytes then by default we'll replace
16835 the Advanced SIMD version with the SVE one.
16836
16837 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16838 than an SVE main loop with N bytes then by default we'll try to
16839 use the SVE loop to vectorize the epilogue instead. */
16840 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16841 unsigned int advsimd_i = 0;
16842 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16843 {
16844 if (sve_i < ARRAY_SIZE (sve_modes)
16845 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16846 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16847 modes->safe_push (sve_modes[sve_i++]);
16848 else
16849 modes->safe_push (advsimd_modes[advsimd_i++]);
16850 }
16851 while (sve_i < ARRAY_SIZE (sve_modes))
16852 modes->safe_push (sve_modes[sve_i++]);
16853
16854 unsigned int flags = 0;
16855 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16856 can compare SVE against Advanced SIMD and so that we can compare
16857 multiple SVE vectorization approaches against each other. There's
16858 not really any point doing this for Advanced SIMD only, since the
16859 first mode that works should always be the best. */
16860 if (TARGET_SVE && aarch64_sve_compare_costs)
16861 flags |= VECT_COMPARE_COSTS;
16862 return flags;
16863 }
16864
16865 /* Implement TARGET_MANGLE_TYPE. */
16866
16867 static const char *
16868 aarch64_mangle_type (const_tree type)
16869 {
16870 /* The AArch64 ABI documents say that "__va_list" has to be
16871 mangled as if it is in the "std" namespace. */
16872 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16873 return "St9__va_list";
16874
16875 /* Half-precision floating point types. */
16876 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16877 {
16878 if (TYPE_MODE (type) == BFmode)
16879 return "u6__bf16";
16880 else
16881 return "Dh";
16882 }
16883
16884 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16885 builtin types. */
16886 if (TYPE_NAME (type) != NULL)
16887 {
16888 const char *res;
16889 if ((res = aarch64_general_mangle_builtin_type (type))
16890 || (res = aarch64_sve::mangle_builtin_type (type)))
16891 return res;
16892 }
16893
16894 /* Use the default mangling. */
16895 return NULL;
16896 }
16897
16898 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16899
16900 static bool
16901 aarch64_verify_type_context (location_t loc, type_context_kind context,
16902 const_tree type, bool silent_p)
16903 {
16904 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16905 }
16906
16907 /* Find the first rtx_insn before insn that will generate an assembly
16908 instruction. */
16909
16910 static rtx_insn *
16911 aarch64_prev_real_insn (rtx_insn *insn)
16912 {
16913 if (!insn)
16914 return NULL;
16915
16916 do
16917 {
16918 insn = prev_real_insn (insn);
16919 }
16920 while (insn && recog_memoized (insn) < 0);
16921
16922 return insn;
16923 }
16924
16925 static bool
16926 is_madd_op (enum attr_type t1)
16927 {
16928 unsigned int i;
16929 /* A number of these may be AArch32 only. */
16930 enum attr_type mlatypes[] = {
16931 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16932 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16933 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16934 };
16935
16936 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16937 {
16938 if (t1 == mlatypes[i])
16939 return true;
16940 }
16941
16942 return false;
16943 }
16944
16945 /* Check if there is a register dependency between a load and the insn
16946 for which we hold recog_data. */
16947
16948 static bool
16949 dep_between_memop_and_curr (rtx memop)
16950 {
16951 rtx load_reg;
16952 int opno;
16953
16954 gcc_assert (GET_CODE (memop) == SET);
16955
16956 if (!REG_P (SET_DEST (memop)))
16957 return false;
16958
16959 load_reg = SET_DEST (memop);
16960 for (opno = 1; opno < recog_data.n_operands; opno++)
16961 {
16962 rtx operand = recog_data.operand[opno];
16963 if (REG_P (operand)
16964 && reg_overlap_mentioned_p (load_reg, operand))
16965 return true;
16966
16967 }
16968 return false;
16969 }
16970
16971
16972 /* When working around the Cortex-A53 erratum 835769,
16973 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16974 instruction and has a preceding memory instruction such that a NOP
16975 should be inserted between them. */
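/* Cortex-A53 erratum 835769 concerns a 64-bit integer multiply-accumulate
   that directly follows a memory access; roughly, a sequence such as

   ldr  x5, [x1]
   madd x0, x2, x3, x4

   gets a NOP inserted between the two instructions (see
   aarch64_final_prescan_insn below).  */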
16976
16977 bool
16978 aarch64_madd_needs_nop (rtx_insn* insn)
16979 {
16980 enum attr_type attr_type;
16981 rtx_insn *prev;
16982 rtx body;
16983
16984 if (!TARGET_FIX_ERR_A53_835769)
16985 return false;
16986
16987 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16988 return false;
16989
16990 attr_type = get_attr_type (insn);
16991 if (!is_madd_op (attr_type))
16992 return false;
16993
16994 prev = aarch64_prev_real_insn (insn);
16995 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16996 Restore recog state to INSN to avoid state corruption. */
16997 extract_constrain_insn_cached (insn);
16998
16999 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17000 return false;
17001
17002 body = single_set (prev);
17003
17004 /* If the previous insn is a memory op and there is no dependency between
17005 it and the DImode madd, emit a NOP between them. If body is NULL then we
17006 have a complex memory operation, probably a load/store pair.
17007 Be conservative for now and emit a NOP. */
17008 if (GET_MODE (recog_data.operand[0]) == DImode
17009 && (!body || !dep_between_memop_and_curr (body)))
17010 return true;
17011
17012 return false;
17013
17014 }
17015
17016
17017 /* Implement FINAL_PRESCAN_INSN. */
17018
17019 void
17020 aarch64_final_prescan_insn (rtx_insn *insn)
17021 {
17022 if (aarch64_madd_needs_nop (insn))
17023 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17024 }
17025
17026
17027 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17028 instruction. */
17029
17030 bool
17031 aarch64_sve_index_immediate_p (rtx base_or_step)
17032 {
17033 return (CONST_INT_P (base_or_step)
17034 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17035 }
17036
17037 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17038 when applied to mode MODE. Negate X first if NEGATE_P is true. */
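/* The valid immediates are an unsigned 8-bit value, optionally shifted
   left by 8.  E.g. a duplicated 1280 (0x500) is accepted as "#5, LSL #8",
   whereas 257 (0x101) is not; with NEGATE_P, a duplicated -5 is accepted
   because a SUB of #5 can be used instead.  */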
17039
17040 bool
17041 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17042 {
17043 rtx elt = unwrap_const_vec_duplicate (x);
17044 if (!CONST_INT_P (elt))
17045 return false;
17046
17047 HOST_WIDE_INT val = INTVAL (elt);
17048 if (negate_p)
17049 val = -val;
17050 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17051
17052 if (val & 0xff)
17053 return IN_RANGE (val, 0, 0xff);
17054 return IN_RANGE (val, 0, 0xff00);
17055 }
17056
17057 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17058 instructions when applied to mode MODE. Negate X first if NEGATE_P
17059 is true. */
17060
17061 bool
17062 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17063 {
17064 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17065 return false;
17066
17067 /* After the optional negation, the immediate must be nonnegative.
17068 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17069 instead of SQADD Zn.B, Zn.B, #129. */
17070 rtx elt = unwrap_const_vec_duplicate (x);
17071 return negate_p == (INTVAL (elt) < 0);
17072 }
17073
17074 /* Return true if X is a valid immediate operand for an SVE logical
17075 instruction such as AND. */
17076
17077 bool
17078 aarch64_sve_bitmask_immediate_p (rtx x)
17079 {
17080 rtx elt;
17081
17082 return (const_vec_duplicate_p (x, &elt)
17083 && CONST_INT_P (elt)
17084 && aarch64_bitmask_imm (INTVAL (elt),
17085 GET_MODE_INNER (GET_MODE (x))));
17086 }
17087
17088 /* Return true if X is a valid immediate for the SVE DUP and CPY
17089 instructions. */
17090
17091 bool
17092 aarch64_sve_dup_immediate_p (rtx x)
17093 {
17094 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17095 if (!CONST_INT_P (x))
17096 return false;
17097
17098 HOST_WIDE_INT val = INTVAL (x);
17099 if (val & 0xff)
17100 return IN_RANGE (val, -0x80, 0x7f);
17101 return IN_RANGE (val, -0x8000, 0x7f00);
17102 }
17103
17104 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17105 SIGNED_P says whether the operand is signed rather than unsigned. */
17106
17107 bool
17108 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17109 {
17110 x = unwrap_const_vec_duplicate (x);
17111 return (CONST_INT_P (x)
17112 && (signed_p
17113 ? IN_RANGE (INTVAL (x), -16, 15)
17114 : IN_RANGE (INTVAL (x), 0, 127)));
17115 }
17116
17117 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17118 instruction. Negate X first if NEGATE_P is true. */
17119
17120 bool
17121 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17122 {
17123 rtx elt;
17124 REAL_VALUE_TYPE r;
17125
17126 if (!const_vec_duplicate_p (x, &elt)
17127 || GET_CODE (elt) != CONST_DOUBLE)
17128 return false;
17129
17130 r = *CONST_DOUBLE_REAL_VALUE (elt);
17131
17132 if (negate_p)
17133 r = real_value_negate (&r);
17134
17135 if (real_equal (&r, &dconst1))
17136 return true;
17137 if (real_equal (&r, &dconsthalf))
17138 return true;
17139 return false;
17140 }
17141
17142 /* Return true if X is a valid immediate operand for an SVE FMUL
17143 instruction. */
17144
17145 bool
17146 aarch64_sve_float_mul_immediate_p (rtx x)
17147 {
17148 rtx elt;
17149
17150 return (const_vec_duplicate_p (x, &elt)
17151 && GET_CODE (elt) == CONST_DOUBLE
17152 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17153 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17154 }
17155
17156 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17157 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17158 is nonnull, use it to describe valid immediates. */
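/* Worked examples: VAL32 == 0x00ab0000 matches the 4-byte form as
   (SImode, 0xab, LSL #16); VAL32 == 0x004a004a matches the 2-byte form as
   (HImode, 0x4a, LSL #0); and for MOV checks, VAL32 == 0x0003ffff matches
   the MSL form as (SImode, 0x03, MSL #16).  */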
17159 static bool
17160 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17161 simd_immediate_info *info,
17162 enum simd_immediate_check which,
17163 simd_immediate_info::insn_type insn)
17164 {
17165 /* Try a 4-byte immediate with LSL. */
17166 for (unsigned int shift = 0; shift < 32; shift += 8)
17167 if ((val32 & (0xff << shift)) == val32)
17168 {
17169 if (info)
17170 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17171 simd_immediate_info::LSL, shift);
17172 return true;
17173 }
17174
17175 /* Try a 2-byte immediate with LSL. */
17176 unsigned int imm16 = val32 & 0xffff;
17177 if (imm16 == (val32 >> 16))
17178 for (unsigned int shift = 0; shift < 16; shift += 8)
17179 if ((imm16 & (0xff << shift)) == imm16)
17180 {
17181 if (info)
17182 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17183 simd_immediate_info::LSL, shift);
17184 return true;
17185 }
17186
17187 /* Try a 4-byte immediate with MSL, except for cases that MVN
17188 can handle. */
17189 if (which == AARCH64_CHECK_MOV)
17190 for (unsigned int shift = 8; shift < 24; shift += 8)
17191 {
17192 unsigned int low = (1 << shift) - 1;
17193 if (((val32 & (0xff << shift)) | low) == val32)
17194 {
17195 if (info)
17196 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17197 simd_immediate_info::MSL, shift);
17198 return true;
17199 }
17200 }
17201
17202 return false;
17203 }
17204
17205 /* Return true if replicating VAL64 is a valid immediate for the
17206 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17207 use it to describe valid immediates. */
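/* E.g. VAL64 == 0xabababababababab is accepted as a replicated byte
   (QImode, 0xab), and VAL64 == 0x00ffff0000ff00ff is accepted for MOV as a
   bit-to-bytemask (DImode immediate), since every byte is either 0x00 or
   0xff.  */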
17208 static bool
17209 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17210 simd_immediate_info *info,
17211 enum simd_immediate_check which)
17212 {
17213 unsigned int val32 = val64 & 0xffffffff;
17214 unsigned int val16 = val64 & 0xffff;
17215 unsigned int val8 = val64 & 0xff;
17216
17217 if (val32 == (val64 >> 32))
17218 {
17219 if ((which & AARCH64_CHECK_ORR) != 0
17220 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17221 simd_immediate_info::MOV))
17222 return true;
17223
17224 if ((which & AARCH64_CHECK_BIC) != 0
17225 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17226 simd_immediate_info::MVN))
17227 return true;
17228
17229 /* Try using a replicated byte. */
17230 if (which == AARCH64_CHECK_MOV
17231 && val16 == (val32 >> 16)
17232 && val8 == (val16 >> 8))
17233 {
17234 if (info)
17235 *info = simd_immediate_info (QImode, val8);
17236 return true;
17237 }
17238 }
17239
17240 /* Try using a bit-to-bytemask. */
17241 if (which == AARCH64_CHECK_MOV)
17242 {
17243 unsigned int i;
17244 for (i = 0; i < 64; i += 8)
17245 {
17246 unsigned char byte = (val64 >> i) & 0xff;
17247 if (byte != 0 && byte != 0xff)
17248 break;
17249 }
17250 if (i == 64)
17251 {
17252 if (info)
17253 *info = simd_immediate_info (DImode, val64);
17254 return true;
17255 }
17256 }
17257 return false;
17258 }
17259
17260 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17261 instruction. If INFO is nonnull, use it to describe valid immediates. */
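/* E.g. VAL64 == 0x0001000100010001 narrows to HImode and is accepted as
   "DUP .h, #1", VAL64 == 0xff00ff00ff00ff00 as "DUP .h, #-1, LSL #8", and
   a replicated bitmask pattern such as 0x00ff00ff00ff00ff as a DUPM.  */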
17262
17263 static bool
17264 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17265 simd_immediate_info *info)
17266 {
17267 scalar_int_mode mode = DImode;
17268 unsigned int val32 = val64 & 0xffffffff;
17269 if (val32 == (val64 >> 32))
17270 {
17271 mode = SImode;
17272 unsigned int val16 = val32 & 0xffff;
17273 if (val16 == (val32 >> 16))
17274 {
17275 mode = HImode;
17276 unsigned int val8 = val16 & 0xff;
17277 if (val8 == (val16 >> 8))
17278 mode = QImode;
17279 }
17280 }
17281 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17282 if (IN_RANGE (val, -0x80, 0x7f))
17283 {
17284 /* DUP with no shift. */
17285 if (info)
17286 *info = simd_immediate_info (mode, val);
17287 return true;
17288 }
17289 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17290 {
17291 /* DUP with LSL #8. */
17292 if (info)
17293 *info = simd_immediate_info (mode, val);
17294 return true;
17295 }
17296 if (aarch64_bitmask_imm (val64, mode))
17297 {
17298 /* DUPM. */
17299 if (info)
17300 *info = simd_immediate_info (mode, val);
17301 return true;
17302 }
17303 return false;
17304 }
17305
17306 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17307
17308 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17309
17310 where PATTERN is the svpattern as a CONST_INT and where ZERO
17311 is a zero constant of the required PTRUE mode (which can have
17312 fewer elements than X's mode, if zero bits are significant).
17313
17314 If so, and if INFO is nonnull, describe the immediate in INFO. */
17315 bool
17316 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17317 {
17318 if (GET_CODE (x) != CONST)
17319 return false;
17320
17321 x = XEXP (x, 0);
17322 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17323 return false;
17324
17325 if (info)
17326 {
17327 aarch64_svpattern pattern
17328 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17329 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17330 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17331 *info = simd_immediate_info (int_mode, pattern);
17332 }
17333 return true;
17334 }
17335
17336 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17337 it to describe valid immediates. */
17338
17339 static bool
17340 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17341 {
17342 if (aarch64_sve_ptrue_svpattern_p (x, info))
17343 return true;
17344
17345 if (x == CONST0_RTX (GET_MODE (x)))
17346 {
17347 if (info)
17348 *info = simd_immediate_info (DImode, 0);
17349 return true;
17350 }
17351
17352 /* Analyze the value as a VNx16BImode. This should be relatively
17353 efficient, since rtx_vector_builder has enough built-in capacity
17354 to store all VLA predicate constants without needing the heap. */
17355 rtx_vector_builder builder;
17356 if (!aarch64_get_sve_pred_bits (builder, x))
17357 return false;
17358
17359 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
17360 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
17361 {
17362 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
17363 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
17364 if (pattern != AARCH64_NUM_SVPATTERNS)
17365 {
17366 if (info)
17367 {
17368 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
17369 *info = simd_immediate_info (int_mode, pattern);
17370 }
17371 return true;
17372 }
17373 }
17374 return false;
17375 }
17376
17377 /* Return true if OP is a valid SIMD immediate for the operation
17378 described by WHICH. If INFO is nonnull, use it to describe valid
17379 immediates. */
17380 bool
17381 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
17382 enum simd_immediate_check which)
17383 {
17384 machine_mode mode = GET_MODE (op);
17385 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17386 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
17387 return false;
17388
17389 if (vec_flags & VEC_SVE_PRED)
17390 return aarch64_sve_pred_valid_immediate (op, info);
17391
17392 scalar_mode elt_mode = GET_MODE_INNER (mode);
17393 rtx base, step;
17394 unsigned int n_elts;
17395 if (GET_CODE (op) == CONST_VECTOR
17396 && CONST_VECTOR_DUPLICATE_P (op))
17397 n_elts = CONST_VECTOR_NPATTERNS (op);
17398 else if ((vec_flags & VEC_SVE_DATA)
17399 && const_vec_series_p (op, &base, &step))
17400 {
17401 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
17402 if (!aarch64_sve_index_immediate_p (base)
17403 || !aarch64_sve_index_immediate_p (step))
17404 return false;
17405
17406 if (info)
17407 {
17408 /* Get the corresponding container mode. E.g. an INDEX on VNx2SI
17409 should yield two integer values per 128-bit block, meaning
17410 that we need to treat it in the same way as VNx2DI and then
17411 ignore the upper 32 bits of each element. */
17412 elt_mode = aarch64_sve_container_int_mode (mode);
17413 *info = simd_immediate_info (elt_mode, base, step);
17414 }
17415 return true;
17416 }
17417 else if (GET_CODE (op) == CONST_VECTOR
17418 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
17419 /* N_ELTS set above. */;
17420 else
17421 return false;
17422
17423 scalar_float_mode elt_float_mode;
17424 if (n_elts == 1
17425 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
17426 {
17427 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
17428 if (aarch64_float_const_zero_rtx_p (elt)
17429 || aarch64_float_const_representable_p (elt))
17430 {
17431 if (info)
17432 *info = simd_immediate_info (elt_float_mode, elt);
17433 return true;
17434 }
17435 }
17436
17437 /* If all elements in an SVE vector have the same value, we have a free
17438 choice between using the element mode and using the container mode.
17439 Using the element mode means that unused parts of the vector are
17440 duplicates of the used elements, while using the container mode means
17441 that the unused parts are an extension of the used elements. Using the
17442 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17443 for its container mode VNx4SI while 0x00000101 isn't.
17444
17445 If not all elements in an SVE vector have the same value, we need the
17446 transition from one element to the next to occur at container boundaries.
17447 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17448 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
17449 scalar_int_mode elt_int_mode;
17450 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
17451 elt_int_mode = aarch64_sve_container_int_mode (mode);
17452 else
17453 elt_int_mode = int_mode_for_mode (elt_mode).require ();
17454
17455 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
17456 if (elt_size > 8)
17457 return false;
17458
17459 /* Expand the vector constant out into a byte vector, with the least
17460 significant byte of the register first. */
17461 auto_vec<unsigned char, 16> bytes;
17462 bytes.reserve (n_elts * elt_size);
17463 for (unsigned int i = 0; i < n_elts; i++)
17464 {
17465 /* The vector is provided in gcc endian-neutral fashion.
17466 For aarch64_be Advanced SIMD, it must be laid out in the vector
17467 register in reverse order. */
17468 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
17469 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
17470
17471 if (elt_mode != elt_int_mode)
17472 elt = gen_lowpart (elt_int_mode, elt);
17473
17474 if (!CONST_INT_P (elt))
17475 return false;
17476
17477 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17478 for (unsigned int byte = 0; byte < elt_size; byte++)
17479 {
17480 bytes.quick_push (elt_val & 0xff);
17481 elt_val >>= BITS_PER_UNIT;
17482 }
17483 }
17484
17485 /* The immediate must repeat every eight bytes. */
17486 unsigned int nbytes = bytes.length ();
17487 for (unsigned i = 8; i < nbytes; ++i)
17488 if (bytes[i] != bytes[i - 8])
17489 return false;
17490
17491 /* Get the repeating 8-byte value as an integer. No endian correction
17492 is needed here because bytes is already in lsb-first order. */
17493 unsigned HOST_WIDE_INT val64 = 0;
17494 for (unsigned int i = 0; i < 8; i++)
17495 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17496 << (i * BITS_PER_UNIT));
17497
17498 if (vec_flags & VEC_SVE_DATA)
17499 return aarch64_sve_valid_immediate (val64, info);
17500 else
17501 return aarch64_advsimd_valid_immediate (val64, info, which);
17502 }
17503
17504 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17505 has a step in the range of INDEX. Return the index expression if so,
17506 otherwise return null. */
17507 rtx
17508 aarch64_check_zero_based_sve_index_immediate (rtx x)
17509 {
17510 rtx base, step;
17511 if (const_vec_series_p (x, &base, &step)
17512 && base == const0_rtx
17513 && aarch64_sve_index_immediate_p (step))
17514 return step;
17515 return NULL_RTX;
17516 }
17517
17518 /* Check whether immediate shift constants are within range. */
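/* A rough illustration for V4SImode (32-bit lanes):
     aarch64_simd_shift_imm_p (GEN_INT (31), V4SImode, true)   -> true
     aarch64_simd_shift_imm_p (GEN_INT (32), V4SImode, true)   -> false
     aarch64_simd_shift_imm_p (GEN_INT (32), V4SImode, false)  -> true
   i.e. left shifts accept 0..31 and right shifts accept 1..32.  */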
17519 bool
17520 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17521 {
17522 x = unwrap_const_vec_duplicate (x);
17523 if (!CONST_INT_P (x))
17524 return false;
17525 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17526 if (left)
17527 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17528 else
17529 return IN_RANGE (INTVAL (x), 1, bit_width);
17530 }
17531
17532 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17533 operation of width WIDTH at bit position POS. */
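/* For example, WIDTH == 8 and POS == 4 yield the mask 0xff0,
   i.e. ((1 << 8) - 1) << 4.  */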
17534
17535 rtx
17536 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17537 {
17538 gcc_assert (CONST_INT_P (width));
17539 gcc_assert (CONST_INT_P (pos));
17540
17541 unsigned HOST_WIDE_INT mask
17542 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17543 return GEN_INT (mask << UINTVAL (pos));
17544 }
17545
17546 bool
17547 aarch64_mov_operand_p (rtx x, machine_mode mode)
17548 {
17549 if (GET_CODE (x) == HIGH
17550 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17551 return true;
17552
17553 if (CONST_INT_P (x))
17554 return true;
17555
17556 if (VECTOR_MODE_P (GET_MODE (x)))
17557 {
17558 /* Require predicate constants to be VNx16BI before RA, so that we
17559 force everything to have a canonical form. */
17560 if (!lra_in_progress
17561 && !reload_completed
17562 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17563 && GET_MODE (x) != VNx16BImode)
17564 return false;
17565
17566 return aarch64_simd_valid_immediate (x, NULL);
17567 }
17568
17569 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17570 return true;
17571
17572 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17573 return true;
17574
17575 return aarch64_classify_symbolic_expression (x)
17576 == SYMBOL_TINY_ABSOLUTE;
17577 }
17578
17579 /* Return a const_int vector of VAL. */
17580 rtx
17581 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
17582 {
17583 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
17584 return gen_const_vec_duplicate (mode, c);
17585 }
17586
17587 /* Check OP is a legal scalar immediate for the MOVI instruction. */
17588
17589 bool
17590 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
17591 {
17592 machine_mode vmode;
17593
17594 vmode = aarch64_simd_container_mode (mode, 64);
17595 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
17596 return aarch64_simd_valid_immediate (op_v, NULL);
17597 }
17598
17599 /* Construct and return a PARALLEL RTX vector with elements numbering the
17600 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17601 the vector - from the perspective of the architecture. This does not
17602 line up with GCC's perspective on lane numbers, so we end up with
17603 different masks depending on our target endian-ness. The diagram
17604 below may help. We must draw the distinction when building masks
17605 which select one half of the vector. An instruction selecting
17606 architectural low-lanes for a big-endian target must be described using
17607 a mask selecting GCC high-lanes.
17608
17609 Big-Endian Little-Endian
17610
17611 GCC 0 1 2 3 3 2 1 0
17612 | x | x | x | x | | x | x | x | x |
17613 Architecture 3 2 1 0 3 2 1 0
17614
17615 Low Mask: { 2, 3 } { 0, 1 }
17616 High Mask: { 0, 1 } { 2, 3 }
17617
17618 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17619
17620 rtx
17621 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17622 {
17623 rtvec v = rtvec_alloc (nunits / 2);
17624 int high_base = nunits / 2;
17625 int low_base = 0;
17626 int base;
17627 rtx t1;
17628 int i;
17629
17630 if (BYTES_BIG_ENDIAN)
17631 base = high ? low_base : high_base;
17632 else
17633 base = high ? high_base : low_base;
17634
17635 for (i = 0; i < nunits / 2; i++)
17636 RTVEC_ELT (v, i) = GEN_INT (base + i);
17637
17638 t1 = gen_rtx_PARALLEL (mode, v);
17639 return t1;
17640 }
17641
17642 /* Check OP for validity as a PARALLEL RTX vector with elements
17643 numbering the lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE)
17644 half, from the perspective of the architecture. See the diagram above
17645 aarch64_simd_vect_par_cnst_half for more details. */
17646
17647 bool
17648 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17649 bool high)
17650 {
17651 int nelts;
17652 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17653 return false;
17654
17655 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17656 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17657 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17658 int i = 0;
17659
17660 if (count_op != count_ideal)
17661 return false;
17662
17663 for (i = 0; i < count_ideal; i++)
17664 {
17665 rtx elt_op = XVECEXP (op, 0, i);
17666 rtx elt_ideal = XVECEXP (ideal, 0, i);
17667
17668 if (!CONST_INT_P (elt_op)
17669 || INTVAL (elt_ideal) != INTVAL (elt_op))
17670 return false;
17671 }
17672 return true;
17673 }
17674
17675 /* Return a PARALLEL containing NELTS elements, with element I equal
17676 to BASE + I * STEP. */
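/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) returns
   (parallel [1 3 5 7]).  */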
17677
17678 rtx
17679 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17680 {
17681 rtvec vec = rtvec_alloc (nelts);
17682 for (unsigned int i = 0; i < nelts; ++i)
17683 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17684 return gen_rtx_PARALLEL (VOIDmode, vec);
17685 }
17686
17687 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17688 series with step STEP. */
17689
17690 bool
17691 aarch64_stepped_int_parallel_p (rtx op, int step)
17692 {
17693 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17694 return false;
17695
17696 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17697 for (int i = 1; i < XVECLEN (op, 0); ++i)
17698 if (!CONST_INT_P (XVECEXP (op, 0, i))
17699 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17700 return false;
17701
17702 return true;
17703 }
17704
17705 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17706 HIGH (exclusive). */
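/* For example, with LOW == 0 and HIGH == 4, lanes 0..3 are accepted
   and lane 4 reports "lane 4 out of range 0 - 3".  */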
17707 void
17708 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17709 const_tree exp)
17710 {
17711 HOST_WIDE_INT lane;
17712 gcc_assert (CONST_INT_P (operand));
17713 lane = INTVAL (operand);
17714
17715 if (lane < low || lane >= high)
17716 {
17717 if (exp)
17718 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17719 else
17720 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17721 }
17722 }
17723
17724 /* Perform endian correction on lane number N, which indexes a vector
17725 of mode MODE, and return the result as an SImode rtx. */
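/* For example, assuming the usual ENDIAN_LANE_N mapping, lane 0 of a
   V4SImode vector stays lane 0 on little-endian targets but becomes
   lane 3 on big-endian targets.  */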
17726
17727 rtx
17728 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17729 {
17730 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17731 }
17732
17733 /* Return TRUE if OP is a valid vector addressing mode. */
17734
17735 bool
17736 aarch64_simd_mem_operand_p (rtx op)
17737 {
17738 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17739 || REG_P (XEXP (op, 0)));
17740 }
17741
17742 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17743
17744 bool
17745 aarch64_sve_ld1r_operand_p (rtx op)
17746 {
17747 struct aarch64_address_info addr;
17748 scalar_mode mode;
17749
17750 return (MEM_P (op)
17751 && is_a <scalar_mode> (GET_MODE (op), &mode)
17752 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17753 && addr.type == ADDRESS_REG_IMM
17754 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17755 }
17756
17757 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17758 where the size of the read data is specified by `mode` and the size of the
17759 vector elements is specified by `elem_mode`. */
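/* A rough illustration for LD1RQ (`mode` == TImode), assuming the usual
   meaning of offset_4bit_signed_scaled_p: a base-plus-immediate address
   needs an offset that is a multiple of 16 in the range [-128, 112],
   while a base-plus-register address needs the index scaled by the
   element size.  */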
17760 bool
17761 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
17762 scalar_mode elem_mode)
17763 {
17764 struct aarch64_address_info addr;
17765 if (!MEM_P (op)
17766 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17767 return false;
17768
17769 if (addr.type == ADDRESS_REG_IMM)
17770 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
17771
17772 if (addr.type == ADDRESS_REG_REG)
17773 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17774
17775 return false;
17776 }
17777
17778 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17779 bool
17780 aarch64_sve_ld1rq_operand_p (rtx op)
17781 {
17782 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
17783 GET_MODE_INNER (GET_MODE (op)));
17784 }
17785
17786 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17787 accessing a vector where the element size is specified by `elem_mode`. */
17788 bool
17789 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
17790 {
17791 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
17792 }
17793
17794 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17795 bool
17796 aarch64_sve_ldff1_operand_p (rtx op)
17797 {
17798 if (!MEM_P (op))
17799 return false;
17800
17801 struct aarch64_address_info addr;
17802 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17803 return false;
17804
17805 if (addr.type == ADDRESS_REG_IMM)
17806 return known_eq (addr.const_offset, 0);
17807
17808 return addr.type == ADDRESS_REG_REG;
17809 }
17810
17811 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17812 bool
17813 aarch64_sve_ldnf1_operand_p (rtx op)
17814 {
17815 struct aarch64_address_info addr;
17816
17817 return (MEM_P (op)
17818 && aarch64_classify_address (&addr, XEXP (op, 0),
17819 GET_MODE (op), false)
17820 && addr.type == ADDRESS_REG_IMM);
17821 }
17822
17823 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17824 The conditions for STR are the same. */
17825 bool
17826 aarch64_sve_ldr_operand_p (rtx op)
17827 {
17828 struct aarch64_address_info addr;
17829
17830 return (MEM_P (op)
17831 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17832 false, ADDR_QUERY_ANY)
17833 && addr.type == ADDRESS_REG_IMM);
17834 }
17835
17836 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17837 addressing memory of mode MODE. */
17838 bool
17839 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17840 {
17841 struct aarch64_address_info addr;
17842 if (!aarch64_classify_address (&addr, op, mode, false))
17843 return false;
17844
17845 if (addr.type == ADDRESS_REG_IMM)
17846 return known_eq (addr.const_offset, 0);
17847
17848 return addr.type == ADDRESS_REG_REG;
17849 }
17850
17851 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17852 We need to be able to access the individual pieces, so the range
17853 is different from LD[234] and ST[234]. */
17854 bool
17855 aarch64_sve_struct_memory_operand_p (rtx op)
17856 {
17857 if (!MEM_P (op))
17858 return false;
17859
17860 machine_mode mode = GET_MODE (op);
17861 struct aarch64_address_info addr;
17862 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17863 ADDR_QUERY_ANY)
17864 || addr.type != ADDRESS_REG_IMM)
17865 return false;
17866
17867 poly_int64 first = addr.const_offset;
17868 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17869 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17870 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17871 }
17872
17873 /* Emit a register copy from operand to operand, taking care not to
17874 early-clobber source registers in the process.
17875
17876 COUNT is the number of components into which the copy needs to be
17877 decomposed. */
17878 void
17879 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17880 unsigned int count)
17881 {
17882 unsigned int i;
17883 int rdest = REGNO (operands[0]);
17884 int rsrc = REGNO (operands[1]);
17885
17886 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17887 || rdest < rsrc)
17888 for (i = 0; i < count; i++)
17889 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17890 gen_rtx_REG (mode, rsrc + i));
17891 else
17892 for (i = 0; i < count; i++)
17893 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17894 gen_rtx_REG (mode, rsrc + count - i - 1));
17895 }
17896
17897 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17898 one of VSTRUCT modes: OI, CI, or XI. */
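/* For example, CImode (a three-vector register list) gives
   (48 / UNITS_PER_VREG) * 4 == 12, i.e. three 4-byte instructions.  */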
17899 int
17900 aarch64_simd_attr_length_rglist (machine_mode mode)
17901 {
17902 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17903 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17904 }
17905
17906 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17907 alignment of a vector to 128 bits. SVE predicates have an alignment of
17908 16 bits. */
17909 static HOST_WIDE_INT
17910 aarch64_simd_vector_alignment (const_tree type)
17911 {
17912 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17913 be set for non-predicate vectors of booleans. Modes are the most
17914 direct way we have of identifying real SVE predicate types. */
17915 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17916 return 16;
17917 widest_int min_size
17918 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17919 return wi::umin (min_size, 128).to_uhwi ();
17920 }
17921
17922 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17923 static poly_uint64
17924 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17925 {
17926 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17927 {
17928 /* If the length of the vector is fixed, try to align to that length,
17929 otherwise don't try to align at all. */
17930 HOST_WIDE_INT result;
17931 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17932 result = TYPE_ALIGN (TREE_TYPE (type));
17933 return result;
17934 }
17935 return TYPE_ALIGN (type);
17936 }
17937
17938 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17939 static bool
17940 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17941 {
17942 if (is_packed)
17943 return false;
17944
17945 /* For fixed-length vectors, check that the vectorizer will aim for
17946 full-vector alignment. This isn't true for generic GCC vectors
17947 that are wider than the ABI maximum of 128 bits. */
17948 poly_uint64 preferred_alignment =
17949 aarch64_vectorize_preferred_vector_alignment (type);
17950 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17951 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17952 preferred_alignment))
17953 return false;
17954
17955 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17956 return true;
17957 }
17958
17959 /* Return true if the vector misalignment factor is supported by the
17960 target. */
17961 static bool
17962 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17963 const_tree type, int misalignment,
17964 bool is_packed)
17965 {
17966 if (TARGET_SIMD && STRICT_ALIGNMENT)
17967 {
17968 /* Return false if the movmisalign pattern is not supported for this mode. */
17969 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17970 return false;
17971
17972 /* Misalignment factor is unknown at compile time. */
17973 if (misalignment == -1)
17974 return false;
17975 }
17976 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17977 is_packed);
17978 }
17979
17980 /* If VALS is a vector constant that can be loaded into a register
17981 using DUP, generate instructions to do so and return an RTX to
17982 assign to the register. Otherwise return NULL_RTX. */
17983 static rtx
17984 aarch64_simd_dup_constant (rtx vals)
17985 {
17986 machine_mode mode = GET_MODE (vals);
17987 machine_mode inner_mode = GET_MODE_INNER (mode);
17988 rtx x;
17989
17990 if (!const_vec_duplicate_p (vals, &x))
17991 return NULL_RTX;
17992
17993 /* We can load this constant by using DUP and a constant in a
17994 single ARM register. This will be cheaper than a vector
17995 load. */
17996 x = copy_to_mode_reg (inner_mode, x);
17997 return gen_vec_duplicate (mode, x);
17998 }
17999
18000
18001 /* Generate code to load VALS, which is a PARALLEL containing only
18002 constants (for vec_init) or CONST_VECTOR, efficiently into a
18003 register. Returns an RTX to copy into the register, or NULL_RTX
18004 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
18005 static rtx
18006 aarch64_simd_make_constant (rtx vals)
18007 {
18008 machine_mode mode = GET_MODE (vals);
18009 rtx const_dup;
18010 rtx const_vec = NULL_RTX;
18011 int n_const = 0;
18012 int i;
18013
18014 if (GET_CODE (vals) == CONST_VECTOR)
18015 const_vec = vals;
18016 else if (GET_CODE (vals) == PARALLEL)
18017 {
18018 /* A CONST_VECTOR must contain only CONST_INTs and
18019 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18020 Only store valid constants in a CONST_VECTOR. */
18021 int n_elts = XVECLEN (vals, 0);
18022 for (i = 0; i < n_elts; ++i)
18023 {
18024 rtx x = XVECEXP (vals, 0, i);
18025 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18026 n_const++;
18027 }
18028 if (n_const == n_elts)
18029 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18030 }
18031 else
18032 gcc_unreachable ();
18033
18034 if (const_vec != NULL_RTX
18035 && aarch64_simd_valid_immediate (const_vec, NULL))
18036 /* Load using MOVI/MVNI. */
18037 return const_vec;
18038 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18039 /* Loaded using DUP. */
18040 return const_dup;
18041 else if (const_vec != NULL_RTX)
18042 /* Load from constant pool. We cannot take advantage of single-cycle
18043 LD1 because we need a PC-relative addressing mode. */
18044 return const_vec;
18045 else
18046 /* A PARALLEL containing something not valid inside CONST_VECTOR.
18047 We cannot construct an initializer. */
18048 return NULL_RTX;
18049 }
18050
18051 /* Expand a vector initialisation sequence, such that TARGET is
18052 initialised to contain VALS. */
18053
18054 void
18055 aarch64_expand_vector_init (rtx target, rtx vals)
18056 {
18057 machine_mode mode = GET_MODE (target);
18058 scalar_mode inner_mode = GET_MODE_INNER (mode);
18059 /* The number of vector elements. */
18060 int n_elts = XVECLEN (vals, 0);
18061 /* The number of vector elements which are not constant. */
18062 int n_var = 0;
18063 rtx any_const = NULL_RTX;
18064 /* The first element of vals. */
18065 rtx v0 = XVECEXP (vals, 0, 0);
18066 bool all_same = true;
18067
18068 /* This is a special vec_init<M><N> where N is not an element mode but a
18069 vector mode with half the elements of M. We expect to find two entries
18070 of mode N in VALS and we must put their concatenation into TARGET. */
18071 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18072 {
18073 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18074 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18075 rtx lo = XVECEXP (vals, 0, 0);
18076 rtx hi = XVECEXP (vals, 0, 1);
18077 machine_mode narrow_mode = GET_MODE (lo);
18078 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18079 gcc_assert (narrow_mode == GET_MODE (hi));
18080
18081 /* When we want to concatenate a half-width vector with zeroes we can
18082 use the aarch64_combinez[_be] patterns. Just make sure that the
18083 zeroes are in the right half. */
18084 if (BYTES_BIG_ENDIAN
18085 && aarch64_simd_imm_zero (lo, narrow_mode)
18086 && general_operand (hi, narrow_mode))
18087 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18088 else if (!BYTES_BIG_ENDIAN
18089 && aarch64_simd_imm_zero (hi, narrow_mode)
18090 && general_operand (lo, narrow_mode))
18091 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18092 else
18093 {
18094 /* Else create the two half-width registers and combine them. */
18095 if (!REG_P (lo))
18096 lo = force_reg (GET_MODE (lo), lo);
18097 if (!REG_P (hi))
18098 hi = force_reg (GET_MODE (hi), hi);
18099
18100 if (BYTES_BIG_ENDIAN)
18101 std::swap (lo, hi);
18102 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18103 }
18104 return;
18105 }
18106
18107 /* Count the number of variable elements to initialise. */
18108 for (int i = 0; i < n_elts; ++i)
18109 {
18110 rtx x = XVECEXP (vals, 0, i);
18111 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18112 ++n_var;
18113 else
18114 any_const = x;
18115
18116 all_same &= rtx_equal_p (x, v0);
18117 }
18118
18119 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18120 how best to handle this. */
18121 if (n_var == 0)
18122 {
18123 rtx constant = aarch64_simd_make_constant (vals);
18124 if (constant != NULL_RTX)
18125 {
18126 emit_move_insn (target, constant);
18127 return;
18128 }
18129 }
18130
18131 /* Splat a single non-constant element if we can. */
18132 if (all_same)
18133 {
18134 rtx x = copy_to_mode_reg (inner_mode, v0);
18135 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18136 return;
18137 }
18138
18139 enum insn_code icode = optab_handler (vec_set_optab, mode);
18140 gcc_assert (icode != CODE_FOR_nothing);
18141
18142 /* If there are only variable elements, try to optimize
18143 the insertion using dup for the most common element
18144 followed by insertions. */
18145
18146 /* The algorithm will fill matches[*][0] with the earliest matching element,
18147 and matches[X][1] with the count of duplicate elements (if X is the
18148 earliest element which has duplicates). */
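/* For example, for { x, y, x, x } this records matches[0][1] == 3 and
   matches[2][0] == matches[3][0] == 0, so element 0 is chosen as the
   value to duplicate and only y needs a separate insertion.  */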
18149
18150 if (n_var == n_elts && n_elts <= 16)
18151 {
18152 int matches[16][2] = {0};
18153 for (int i = 0; i < n_elts; i++)
18154 {
18155 for (int j = 0; j <= i; j++)
18156 {
18157 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18158 {
18159 matches[i][0] = j;
18160 matches[j][1]++;
18161 break;
18162 }
18163 }
18164 }
18165 int maxelement = 0;
18166 int maxv = 0;
18167 for (int i = 0; i < n_elts; i++)
18168 if (matches[i][1] > maxv)
18169 {
18170 maxelement = i;
18171 maxv = matches[i][1];
18172 }
18173
18174 /* Create a duplicate of the most common element, unless all elements
18175 are equally useless to us, in which case just immediately set the
18176 vector register using the first element. */
18177
18178 if (maxv == 1)
18179 {
18180 /* For vectors of two 64-bit elements, we can do even better. */
18181 if (n_elts == 2
18182 && (inner_mode == E_DImode
18183 || inner_mode == E_DFmode))
18184
18185 {
18186 rtx x0 = XVECEXP (vals, 0, 0);
18187 rtx x1 = XVECEXP (vals, 0, 1);
18188 /* Combine can pick up this case, but handling it directly
18189 here leaves clearer RTL.
18190
18191 This is load_pair_lanes<mode>, and also gives us a clean-up
18192 for store_pair_lanes<mode>. */
18193 if (memory_operand (x0, inner_mode)
18194 && memory_operand (x1, inner_mode)
18195 && !STRICT_ALIGNMENT
18196 && rtx_equal_p (XEXP (x1, 0),
18197 plus_constant (Pmode,
18198 XEXP (x0, 0),
18199 GET_MODE_SIZE (inner_mode))))
18200 {
18201 rtx t;
18202 if (inner_mode == DFmode)
18203 t = gen_load_pair_lanesdf (target, x0, x1);
18204 else
18205 t = gen_load_pair_lanesdi (target, x0, x1);
18206 emit_insn (t);
18207 return;
18208 }
18209 }
18210 /* The subreg-move sequence below will move into lane zero of the
18211 vector register. For big-endian we want that position to hold
18212 the last element of VALS. */
18213 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18214 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18215 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18216 }
18217 else
18218 {
18219 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18220 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18221 }
18222
18223 /* Insert the rest. */
18224 for (int i = 0; i < n_elts; i++)
18225 {
18226 rtx x = XVECEXP (vals, 0, i);
18227 if (matches[i][0] == maxelement)
18228 continue;
18229 x = copy_to_mode_reg (inner_mode, x);
18230 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18231 }
18232 return;
18233 }
18234
18235 /* Initialise a vector which is part-variable. We want to first try
18236 to build those lanes which are constant in the most efficient way we
18237 can. */
18238 if (n_var != n_elts)
18239 {
18240 rtx copy = copy_rtx (vals);
18241
18242 /* Load constant part of vector. We really don't care what goes into the
18243 parts we will overwrite, but we're more likely to be able to load the
18244 constant efficiently if it has fewer, larger, repeating parts
18245 (see aarch64_simd_valid_immediate). */
18246 for (int i = 0; i < n_elts; i++)
18247 {
18248 rtx x = XVECEXP (vals, 0, i);
18249 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18250 continue;
18251 rtx subst = any_const;
18252 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18253 {
18254 /* Look in the copied vector, as more elements are const. */
18255 rtx test = XVECEXP (copy, 0, i ^ bit);
18256 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18257 {
18258 subst = test;
18259 break;
18260 }
18261 }
18262 XVECEXP (copy, 0, i) = subst;
18263 }
18264 aarch64_expand_vector_init (target, copy);
18265 }
18266
18267 /* Insert the variable lanes directly. */
18268 for (int i = 0; i < n_elts; i++)
18269 {
18270 rtx x = XVECEXP (vals, 0, i);
18271 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18272 continue;
18273 x = copy_to_mode_reg (inner_mode, x);
18274 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18275 }
18276 }
18277
18278 /* Emit RTL corresponding to:
18279 insr TARGET, ELEM. */
18280
18281 static void
18282 emit_insr (rtx target, rtx elem)
18283 {
18284 machine_mode mode = GET_MODE (target);
18285 scalar_mode elem_mode = GET_MODE_INNER (mode);
18286 elem = force_reg (elem_mode, elem);
18287
18288 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18289 gcc_assert (icode != CODE_FOR_nothing);
18290 emit_insn (GEN_FCN (icode) (target, target, elem));
18291 }
18292
18293 /* Subroutine of aarch64_sve_expand_vector_init for handling
18294 trailing constants.
18295 This function works as follows:
18296 (a) Create a new vector consisting of trailing constants.
18297 (b) Initialize TARGET with the constant vector using emit_move_insn.
18298 (c) Insert remaining elements in TARGET using insr.
18299 NELTS is the total number of elements in the original vector, while
18300 NELTS_REQD is the number of elements that are actually
18301 significant.
18302
18303 ??? The heuristic used is to do the above only if the number of constants
18304 is at least half the total number of elements. May need fine-tuning. */
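/* For example, for { a, b, 1, 2 } (NELTS_REQD == 4) the two trailing
   constants meet the threshold: TARGET is first loaded with the
   constant vector built from { 1, 2, ... } and then b and a are
   shifted in with INSR, each insertion moving the vector up by one
   lane.  */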
18305
18306 static bool
18307 aarch64_sve_expand_vector_init_handle_trailing_constants
18308 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18309 {
18310 machine_mode mode = GET_MODE (target);
18311 scalar_mode elem_mode = GET_MODE_INNER (mode);
18312 int n_trailing_constants = 0;
18313
18314 for (int i = nelts_reqd - 1;
18315 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
18316 i--)
18317 n_trailing_constants++;
18318
18319 if (n_trailing_constants >= nelts_reqd / 2)
18320 {
18321 rtx_vector_builder v (mode, 1, nelts);
18322 for (int i = 0; i < nelts; i++)
18323 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
18324 rtx const_vec = v.build ();
18325 emit_move_insn (target, const_vec);
18326
18327 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18328 emit_insr (target, builder.elt (i));
18329
18330 return true;
18331 }
18332
18333 return false;
18334 }
18335
18336 /* Subroutine of aarch64_sve_expand_vector_init.
18337 Works as follows:
18338 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
18339 (b) Skip trailing elements from BUILDER, which are the same as
18340 element NELTS_REQD - 1.
18341 (c) Insert earlier elements in reverse order in TARGET using insr. */
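/* For example, for { a, b, c, c } and NELTS_REQD == 4, TARGET is set
   to a broadcast of c, the duplicated trailing c is skipped, and then
   INSR of b followed by INSR of a produces { a, b, c, c, ... }.  */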
18342
18343 static void
18344 aarch64_sve_expand_vector_init_insert_elems (rtx target,
18345 const rtx_vector_builder &builder,
18346 int nelts_reqd)
18347 {
18348 machine_mode mode = GET_MODE (target);
18349 scalar_mode elem_mode = GET_MODE_INNER (mode);
18350
18351 struct expand_operand ops[2];
18352 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
18353 gcc_assert (icode != CODE_FOR_nothing);
18354
18355 create_output_operand (&ops[0], target, mode);
18356 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
18357 expand_insn (icode, 2, ops);
18358
18359 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18360 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
18361 emit_insr (target, builder.elt (i));
18362 }
18363
18364 /* Subroutine of aarch64_sve_expand_vector_init to handle case
18365 when all trailing elements of builder are same.
18366 This works as follows:
18367 (a) Use expand_insn interface to broadcast last vector element in TARGET.
18368 (b) Insert remaining elements in TARGET using insr.
18369
18370 ??? The heuristic used is to do the above if the number of identical trailing
18371 elements is at least 3/4 of the total number of elements, loosely based on
18372 the heuristic from mostly_zeros_p. May need fine-tuning. */
18373
18374 static bool
18375 aarch64_sve_expand_vector_init_handle_trailing_same_elem
18376 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
18377 {
18378 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18379 if (ndups >= (3 * nelts_reqd) / 4)
18380 {
18381 aarch64_sve_expand_vector_init_insert_elems (target, builder,
18382 nelts_reqd - ndups + 1);
18383 return true;
18384 }
18385
18386 return false;
18387 }
18388
18389 /* Initialize register TARGET from BUILDER. NELTS is the constant number
18390 of elements in BUILDER.
18391
18392 The function tries to initialize TARGET from BUILDER if it fits one
18393 of the special cases outlined below.
18394
18395 Failing that, the function divides BUILDER into two sub-vectors:
18396 v_even = even elements of BUILDER;
18397 v_odd = odd elements of BUILDER;
18398
18399 and recursively calls itself with v_even and v_odd.
18400
18401 if (recursive call succeeded for v_even or v_odd)
18402 TARGET = zip (v_even, v_odd)
18403
18404 The function returns true if it managed to build TARGET from BUILDER
18405 with one of the special cases, false otherwise.
18406
18407 Example: {a, 1, b, 2, c, 3, d, 4}
18408
18409 The vector gets divided into:
18410 v_even = {a, b, c, d}
18411 v_odd = {1, 2, 3, 4}
18412
18413 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
18414 initializes tmp2 from the constant vector v_odd using emit_move_insn.
18415
18416 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
18417 4 elements, so we construct tmp1 from v_even using insr:
18418 tmp1 = dup(d)
18419 insr tmp1, c
18420 insr tmp1, b
18421 insr tmp1, a
18422
18423 And finally:
18424 TARGET = zip (tmp1, tmp2)
18425 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
18426
18427 static bool
18428 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
18429 int nelts, int nelts_reqd)
18430 {
18431 machine_mode mode = GET_MODE (target);
18432
18433 /* Case 1: Vector contains trailing constants. */
18434
18435 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18436 (target, builder, nelts, nelts_reqd))
18437 return true;
18438
18439 /* Case 2: Vector contains leading constants. */
18440
18441 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
18442 for (int i = 0; i < nelts_reqd; i++)
18443 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
18444 rev_builder.finalize ();
18445
18446 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18447 (target, rev_builder, nelts, nelts_reqd))
18448 {
18449 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18450 return true;
18451 }
18452
18453 /* Case 3: Vector contains trailing same element. */
18454
18455 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18456 (target, builder, nelts_reqd))
18457 return true;
18458
18459 /* Case 4: Vector contains leading same element. */
18460
18461 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18462 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
18463 {
18464 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18465 return true;
18466 }
18467
18468 /* Avoid recursing below 4 elements.
18469 ??? The threshold 4 may need fine-tuning. */
18470
18471 if (nelts_reqd <= 4)
18472 return false;
18473
18474 rtx_vector_builder v_even (mode, 1, nelts);
18475 rtx_vector_builder v_odd (mode, 1, nelts);
18476
18477 for (int i = 0; i < nelts * 2; i += 2)
18478 {
18479 v_even.quick_push (builder.elt (i));
18480 v_odd.quick_push (builder.elt (i + 1));
18481 }
18482
18483 v_even.finalize ();
18484 v_odd.finalize ();
18485
18486 rtx tmp1 = gen_reg_rtx (mode);
18487 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18488 nelts, nelts_reqd / 2);
18489
18490 rtx tmp2 = gen_reg_rtx (mode);
18491 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18492 nelts, nelts_reqd / 2);
18493
18494 if (!did_even_p && !did_odd_p)
18495 return false;
18496
18497 /* Initialize v_even and v_odd using INSR if they didn't match any of the
18498 special cases, then zip v_even and v_odd. */
18499
18500 if (!did_even_p)
18501 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18502
18503 if (!did_odd_p)
18504 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18505
18506 rtvec v = gen_rtvec (2, tmp1, tmp2);
18507 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18508 return true;
18509 }
18510
18511 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18512
18513 void
18514 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18515 {
18516 machine_mode mode = GET_MODE (target);
18517 int nelts = XVECLEN (vals, 0);
18518
18519 rtx_vector_builder v (mode, 1, nelts);
18520 for (int i = 0; i < nelts; i++)
18521 v.quick_push (XVECEXP (vals, 0, i));
18522 v.finalize ();
18523
18524 /* If neither sub-vector of v could be initialized specially,
18525 then use INSR to insert all elements from v into TARGET.
18526 ??? This might not be optimal for vectors with large
18527 initializers like 16-element or above.
18528 For nelts < 4, it probably isn't useful to handle specially. */
18529
18530 if (nelts < 4
18531 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18532 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18533 }
18534
18535 /* Check whether VALUE is a vector constant in which every element
18536 is either a power of 2 or a negated power of 2. If so, return
18537 a constant vector of log2s, and flip CODE between PLUS and MINUS
18538 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
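/* For example, { 4, 4, 4, 4 } yields the shift vector { 2, 2, 2, 2 }
   with CODE left unchanged, while { -8, -8, -8, -8 } yields
   { 3, 3, 3, 3 } and flips CODE between PLUS and MINUS.  */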
18539
18540 static rtx
18541 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18542 {
18543 if (GET_CODE (value) != CONST_VECTOR)
18544 return NULL_RTX;
18545
18546 rtx_vector_builder builder;
18547 if (!builder.new_unary_operation (GET_MODE (value), value, false))
18548 return NULL_RTX;
18549
18550 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18551 /* 1 if the result of the multiplication must be negated,
18552 0 if it mustn't, or -1 if we don't yet care. */
18553 int negate = -1;
18554 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18555 for (unsigned int i = 0; i < encoded_nelts; ++i)
18556 {
18557 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18558 if (!CONST_SCALAR_INT_P (elt))
18559 return NULL_RTX;
18560 rtx_mode_t val (elt, int_mode);
18561 wide_int pow2 = wi::neg (val);
18562 if (val != pow2)
18563 {
18564 /* It matters whether we negate or not. Make that choice,
18565 and make sure that it's consistent with previous elements. */
18566 if (negate == !wi::neg_p (val))
18567 return NULL_RTX;
18568 negate = wi::neg_p (val);
18569 if (!negate)
18570 pow2 = val;
18571 }
18572 /* POW2 is now the value that we want to be a power of 2. */
18573 int shift = wi::exact_log2 (pow2);
18574 if (shift < 0)
18575 return NULL_RTX;
18576 builder.quick_push (gen_int_mode (shift, int_mode));
18577 }
18578 if (negate == -1)
18579 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
18580 code = PLUS;
18581 else if (negate == 1)
18582 code = code == PLUS ? MINUS : PLUS;
18583 return builder.build ();
18584 }
18585
18586 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18587 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
18588 operands array, in the same order as for fma_optab. Return true if
18589 the function emitted all the necessary instructions, false if the caller
18590 should generate the pattern normally with the new OPERANDS array. */
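/* For example, a multiply-add whose multiplier operand is the constant
   vector { 4, 4, ... } is emitted here as a vector shift left by 2
   followed by an add, avoiding the multiplication.  */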
18591
18592 bool
18593 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
18594 {
18595 machine_mode mode = GET_MODE (operands[0]);
18596 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
18597 {
18598 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
18599 NULL_RTX, true, OPTAB_DIRECT);
18600 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
18601 operands[3], product, operands[0], true,
18602 OPTAB_DIRECT);
18603 return true;
18604 }
18605 operands[2] = force_reg (mode, operands[2]);
18606 return false;
18607 }
18608
18609 /* Likewise, but for a conditional pattern. */
18610
18611 bool
18612 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18613 {
18614 machine_mode mode = GET_MODE (operands[0]);
18615 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18616 {
18617 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18618 NULL_RTX, true, OPTAB_DIRECT);
18619 emit_insn (gen_cond (code, mode, operands[0], operands[1],
18620 operands[4], product, operands[5]));
18621 return true;
18622 }
18623 operands[3] = force_reg (mode, operands[3]);
18624 return false;
18625 }
18626
18627 static unsigned HOST_WIDE_INT
18628 aarch64_shift_truncation_mask (machine_mode mode)
18629 {
18630 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18631 return 0;
18632 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18633 }
18634
18635 /* Select a format to encode pointers in exception handling data. */
18636 int
18637 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18638 {
18639 int type;
18640 switch (aarch64_cmodel)
18641 {
18642 case AARCH64_CMODEL_TINY:
18643 case AARCH64_CMODEL_TINY_PIC:
18644 case AARCH64_CMODEL_SMALL:
18645 case AARCH64_CMODEL_SMALL_PIC:
18646 case AARCH64_CMODEL_SMALL_SPIC:
18647 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18648 for everything. */
18649 type = DW_EH_PE_sdata4;
18650 break;
18651 default:
18652 /* No assumptions here. 8-byte relocs required. */
18653 type = DW_EH_PE_sdata8;
18654 break;
18655 }
18656 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18657 }
18658
18659 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18660
18661 static void
18662 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18663 {
18664 if (TREE_CODE (decl) == FUNCTION_DECL)
18665 {
18666 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18667 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18668 {
18669 fprintf (stream, "\t.variant_pcs\t");
18670 assemble_name (stream, name);
18671 fprintf (stream, "\n");
18672 }
18673 }
18674 }
18675
18676 /* The last .arch and .tune assembly strings that we printed. */
18677 static std::string aarch64_last_printed_arch_string;
18678 static std::string aarch64_last_printed_tune_string;
18679
18680 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18681 by the function fndecl. */
18682
18683 void
18684 aarch64_declare_function_name (FILE *stream, const char* name,
18685 tree fndecl)
18686 {
18687 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18688
18689 struct cl_target_option *targ_options;
18690 if (target_parts)
18691 targ_options = TREE_TARGET_OPTION (target_parts);
18692 else
18693 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18694 gcc_assert (targ_options);
18695
18696 const struct processor *this_arch
18697 = aarch64_get_arch (targ_options->x_explicit_arch);
18698
18699 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18700 std::string extension
18701 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18702 this_arch->flags);
18703 /* Only update the assembler .arch string if it is distinct from the last
18704 such string we printed. */
18705 std::string to_print = this_arch->name + extension;
18706 if (to_print != aarch64_last_printed_arch_string)
18707 {
18708 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18709 aarch64_last_printed_arch_string = to_print;
18710 }
18711
18712 /* Print the cpu name we're tuning for in the comments; it might be
18713 useful to readers of the generated asm. Do it only when it changes
18714 from function to function and verbose assembly is requested. */
18715 const struct processor *this_tune
18716 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18717
18718 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18719 {
18720 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18721 this_tune->name);
18722 aarch64_last_printed_tune_string = this_tune->name;
18723 }
18724
18725 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18726
18727 /* Don't forget the type directive for ELF. */
18728 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18729 ASM_OUTPUT_LABEL (stream, name);
18730
18731 cfun->machine->label_is_assembled = true;
18732 }
18733
18734 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
18735 the function label and emit a BTI if necessary. */
18736
18737 void
18738 aarch64_print_patchable_function_entry (FILE *file,
18739 unsigned HOST_WIDE_INT patch_area_size,
18740 bool record_p)
18741 {
18742 if (cfun->machine->label_is_assembled
18743 && aarch64_bti_enabled ()
18744 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
18745 {
18746 /* Remove the BTI that follows the patch area and insert a new BTI
18747 before the patch area right after the function label. */
18748 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
18749 if (insn
18750 && INSN_P (insn)
18751 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
18752 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
18753 delete_insn (insn);
18754 asm_fprintf (file, "\thint\t34 // bti c\n");
18755 }
18756
18757 default_print_patchable_function_entry (file, patch_area_size, record_p);
18758 }
18759
18760 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18761
18762 void
18763 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18764 {
18765 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18766 const char *value = IDENTIFIER_POINTER (target);
18767 aarch64_asm_output_variant_pcs (stream, decl, name);
18768 ASM_OUTPUT_DEF (stream, name, value);
18769 }
18770
18771 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18772 function symbol references. */
18773
18774 void
18775 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18776 {
18777 default_elf_asm_output_external (stream, decl, name);
18778 aarch64_asm_output_variant_pcs (stream, decl, name);
18779 }
18780
18781 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18782 Used to output the .cfi_b_key_frame directive when signing the current
18783 function with the B key. */
18784
18785 void
18786 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18787 {
18788 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18789 && aarch64_ra_sign_key == AARCH64_KEY_B)
18790 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18791 }
18792
18793 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18794
18795 static void
18796 aarch64_start_file (void)
18797 {
18798 struct cl_target_option *default_options
18799 = TREE_TARGET_OPTION (target_option_default_node);
18800
18801 const struct processor *default_arch
18802 = aarch64_get_arch (default_options->x_explicit_arch);
18803 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18804 std::string extension
18805 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18806 default_arch->flags);
18807
18808 aarch64_last_printed_arch_string = default_arch->name + extension;
18809 aarch64_last_printed_tune_string = "";
18810 asm_fprintf (asm_out_file, "\t.arch %s\n",
18811 aarch64_last_printed_arch_string.c_str ());
18812
18813 default_file_start ();
18814 }
18815
18816 /* Emit load exclusive. */
18817
18818 static void
18819 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18820 rtx mem, rtx model_rtx)
18821 {
18822 if (mode == TImode)
18823 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18824 gen_highpart (DImode, rval),
18825 mem, model_rtx));
18826 else
18827 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18828 }
18829
18830 /* Emit store exclusive. */
18831
18832 static void
18833 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18834 rtx mem, rtx rval, rtx model_rtx)
18835 {
18836 if (mode == TImode)
18837 emit_insn (gen_aarch64_store_exclusive_pair
18838 (bval, mem, operand_subword (rval, 0, 0, TImode),
18839 operand_subword (rval, 1, 0, TImode), model_rtx));
18840 else
18841 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18842 }
18843
18844 /* Mark the previous jump instruction as unlikely. */
18845
18846 static void
18847 aarch64_emit_unlikely_jump (rtx insn)
18848 {
18849 rtx_insn *jump = emit_jump_insn (insn);
18850 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18851 }
18852
18853 /* We store the names of the various atomic helpers in a 5x4 array.
18854 Return the libcall function given MODE, MODEL and NAMES. */
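/* For example, an SImode operation with MEMMODEL_ACQUIRE selects
   mode_idx 2 and model_idx 1, so with the CAS names below the helper
   chosen is "__aarch64_cas4_acq".  */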
18855
18856 rtx
18857 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18858 const atomic_ool_names *names)
18859 {
18860 memmodel model = memmodel_base (INTVAL (model_rtx));
18861 int mode_idx, model_idx;
18862
18863 switch (mode)
18864 {
18865 case E_QImode:
18866 mode_idx = 0;
18867 break;
18868 case E_HImode:
18869 mode_idx = 1;
18870 break;
18871 case E_SImode:
18872 mode_idx = 2;
18873 break;
18874 case E_DImode:
18875 mode_idx = 3;
18876 break;
18877 case E_TImode:
18878 mode_idx = 4;
18879 break;
18880 default:
18881 gcc_unreachable ();
18882 }
18883
18884 switch (model)
18885 {
18886 case MEMMODEL_RELAXED:
18887 model_idx = 0;
18888 break;
18889 case MEMMODEL_CONSUME:
18890 case MEMMODEL_ACQUIRE:
18891 model_idx = 1;
18892 break;
18893 case MEMMODEL_RELEASE:
18894 model_idx = 2;
18895 break;
18896 case MEMMODEL_ACQ_REL:
18897 case MEMMODEL_SEQ_CST:
18898 model_idx = 3;
18899 break;
18900 default:
18901 gcc_unreachable ();
18902 }
18903
18904 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18905 VISIBILITY_HIDDEN);
18906 }
18907
18908 #define DEF0(B, N) \
18909 { "__aarch64_" #B #N "_relax", \
18910 "__aarch64_" #B #N "_acq", \
18911 "__aarch64_" #B #N "_rel", \
18912 "__aarch64_" #B #N "_acq_rel" }
18913
18914 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18915 { NULL, NULL, NULL, NULL }
18916 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18917
18918 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18919 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18920 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18921 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18922 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18923 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18924
18925 #undef DEF0
18926 #undef DEF4
18927 #undef DEF5
18928
18929 /* Expand a compare and swap pattern. */
18930
18931 void
18932 aarch64_expand_compare_and_swap (rtx operands[])
18933 {
18934 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18935 machine_mode mode, r_mode;
18936
18937 bval = operands[0];
18938 rval = operands[1];
18939 mem = operands[2];
18940 oldval = operands[3];
18941 newval = operands[4];
18942 is_weak = operands[5];
18943 mod_s = operands[6];
18944 mod_f = operands[7];
18945 mode = GET_MODE (mem);
18946
18947 /* Normally the succ memory model must be stronger than fail, but in the
18948 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18949 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18950 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18951 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18952 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18953
18954 r_mode = mode;
18955 if (mode == QImode || mode == HImode)
18956 {
18957 r_mode = SImode;
18958 rval = gen_reg_rtx (r_mode);
18959 }
18960
18961 if (TARGET_LSE)
18962 {
18963 /* The CAS insn requires that oldval and rval overlap, but we need to
18964 have a copy of oldval saved across the operation to tell if
18965 the operation is successful. */
18966 if (reg_overlap_mentioned_p (rval, oldval))
18967 rval = copy_to_mode_reg (r_mode, oldval);
18968 else
18969 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18970
18971 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18972 newval, mod_s));
18973 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18974 }
18975 else if (TARGET_OUTLINE_ATOMICS)
18976 {
18977 /* Oldval must satisfy compare afterward. */
18978 if (!aarch64_plus_operand (oldval, mode))
18979 oldval = force_reg (mode, oldval);
18980 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18981 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18982 oldval, mode, newval, mode,
18983 XEXP (mem, 0), Pmode);
18984 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18985 }
18986 else
18987 {
18988 /* The oldval predicate varies by mode. Test it and force to reg. */
18989 insn_code code = code_for_aarch64_compare_and_swap (mode);
18990 if (!insn_data[code].operand[2].predicate (oldval, mode))
18991 oldval = force_reg (mode, oldval);
18992
18993 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18994 is_weak, mod_s, mod_f));
18995 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18996 }
18997
18998 if (r_mode != mode)
18999 rval = gen_lowpart (mode, rval);
19000 emit_move_insn (operands[1], rval);
19001
19002 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19003 emit_insn (gen_rtx_SET (bval, x));
19004 }
19005
19006 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
19007 sequence implementing an atomic operation. */
19008
19009 static void
19010 aarch64_emit_post_barrier (enum memmodel model)
19011 {
19012 const enum memmodel base_model = memmodel_base (model);
19013
19014 if (is_mm_sync (model)
19015 && (base_model == MEMMODEL_ACQUIRE
19016 || base_model == MEMMODEL_ACQ_REL
19017 || base_model == MEMMODEL_SEQ_CST))
19018 {
19019 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19020 }
19021 }
19022
19023 /* Split a compare and swap pattern. */
19024
19025 void
19026 aarch64_split_compare_and_swap (rtx operands[])
19027 {
19028 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19029 gcc_assert (epilogue_completed);
19030
19031 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19032 machine_mode mode;
19033 bool is_weak;
19034 rtx_code_label *label1, *label2;
19035 enum memmodel model;
19036
19037 rval = operands[0];
19038 mem = operands[1];
19039 oldval = operands[2];
19040 newval = operands[3];
19041 is_weak = (operands[4] != const0_rtx);
19042 model_rtx = operands[5];
19043 scratch = operands[7];
19044 mode = GET_MODE (mem);
19045 model = memmodel_from_int (INTVAL (model_rtx));
19046
19047 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19048 loop:
19049 .label1:
19050 LD[A]XR rval, [mem]
19051 CBNZ rval, .label2
19052 ST[L]XR scratch, newval, [mem]
19053 CBNZ scratch, .label1
19054 .label2:
19055 CMP rval, 0. */
19056 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
19057 oldval == const0_rtx && mode != TImode);
19058
19059 label1 = NULL;
19060 if (!is_weak)
19061 {
19062 label1 = gen_label_rtx ();
19063 emit_label (label1);
19064 }
19065 label2 = gen_label_rtx ();
19066
19067 /* The initial load can be relaxed for a __sync operation since a final
19068 barrier will be emitted to stop code hoisting. */
19069 if (is_mm_sync (model))
19070 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19071 else
19072 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19073
19074 if (strong_zero_p)
19075 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19076 else
19077 {
19078 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19079 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19080 }
19081 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19082 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19083 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19084
19085 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19086
19087 if (!is_weak)
19088 {
19089 if (aarch64_track_speculation)
19090 {
19091 /* Emit an explicit compare instruction, so that we can correctly
19092 track the condition codes. */
19093 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19094 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19095 }
19096 else
19097 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19098
19099 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19100 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19101 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19102 }
19103 else
19104 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19105
19106 emit_label (label2);
19107
19108 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
19109 to set the condition flags. If this is not used it will be removed by
19110 later passes. */
19111 if (strong_zero_p)
19112 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19113
19114 /* Emit any final barrier needed for a __sync operation. */
19115 if (is_mm_sync (model))
19116 aarch64_emit_post_barrier (model);
19117 }
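
/* As a rough illustration, when OLDVAL is not known to be zero the split
   above produces a sequence along the lines of:

	.label1:
	  LD[A]XR	rval, [mem]
	  CMP		rval, oldval
	  B.NE		.label2
	  ST[L]XR	scratch, newval, [mem]
	  CBNZ		scratch, .label1
	.label2:

   where the CBNZ retry branch is present only for the strong version; the
   weak version simply reports the ST[L]XR result instead of retrying.  The
   exact instructions depend on the mode and memory model.  */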
19118
19119 /* Split an atomic operation. */
19120
19121 void
19122 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19123 rtx value, rtx model_rtx, rtx cond)
19124 {
19125 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19126 gcc_assert (epilogue_completed);
19127
19128 machine_mode mode = GET_MODE (mem);
19129 machine_mode wmode = (mode == DImode ? DImode : SImode);
19130 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19131 const bool is_sync = is_mm_sync (model);
19132 rtx_code_label *label;
19133 rtx x;
19134
19135 /* Split the atomic operation into a sequence. */
19136 label = gen_label_rtx ();
19137 emit_label (label);
19138
19139 if (new_out)
19140 new_out = gen_lowpart (wmode, new_out);
19141 if (old_out)
19142 old_out = gen_lowpart (wmode, old_out);
19143 else
19144 old_out = new_out;
19145 value = simplify_gen_subreg (wmode, value, mode, 0);
19146
19147 /* The initial load can be relaxed for a __sync operation since a final
19148 barrier will be emitted to stop code hoisting. */
19149 if (is_sync)
19150 aarch64_emit_load_exclusive (mode, old_out, mem,
19151 GEN_INT (MEMMODEL_RELAXED));
19152 else
19153 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19154
19155 switch (code)
19156 {
19157 case SET:
19158 new_out = value;
19159 break;
19160
19161 case NOT:
19162 x = gen_rtx_AND (wmode, old_out, value);
19163 emit_insn (gen_rtx_SET (new_out, x));
19164 x = gen_rtx_NOT (wmode, new_out);
19165 emit_insn (gen_rtx_SET (new_out, x));
19166 break;
19167
19168 case MINUS:
19169 if (CONST_INT_P (value))
19170 {
19171 value = GEN_INT (-INTVAL (value));
19172 code = PLUS;
19173 }
19174 /* Fall through. */
19175
19176 default:
19177 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19178 emit_insn (gen_rtx_SET (new_out, x));
19179 break;
19180 }
19181
19182 aarch64_emit_store_exclusive (mode, cond, mem,
19183 gen_lowpart (mode, new_out), model_rtx);
19184
19185 if (aarch64_track_speculation)
19186 {
19187 /* Emit an explicit compare instruction, so that we can correctly
19188 track the condition codes. */
19189 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19190 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19191 }
19192 else
19193 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19194
19195 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19196 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19197 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19198
19199 /* Emit any final barrier needed for a __sync operation. */
19200 if (is_sync)
19201 aarch64_emit_post_barrier (model);
19202 }
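
/* As a rough illustration, splitting an atomic fetch-and-add (CODE == PLUS)
   produces a loop of this general shape:

	.loop:
	  LD[A]XR	old_out, [mem]
	  ADD		new_out, old_out, value
	  ST[L]XR	cond, new_out, [mem]
	  CBNZ		cond, .loop

   followed, for __sync operations, by the trailing barrier emitted in
   aarch64_emit_post_barrier.  */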
19203
19204 static void
19205 aarch64_init_libfuncs (void)
19206 {
19207 /* Half-precision float operations. The compiler handles all operations
19208 with NULL libfuncs by converting to SFmode. */
19209
19210 /* Conversions. */
19211 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19212 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19213
19214 /* Arithmetic. */
19215 set_optab_libfunc (add_optab, HFmode, NULL);
19216 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19217 set_optab_libfunc (smul_optab, HFmode, NULL);
19218 set_optab_libfunc (neg_optab, HFmode, NULL);
19219 set_optab_libfunc (sub_optab, HFmode, NULL);
19220
19221 /* Comparisons. */
19222 set_optab_libfunc (eq_optab, HFmode, NULL);
19223 set_optab_libfunc (ne_optab, HFmode, NULL);
19224 set_optab_libfunc (lt_optab, HFmode, NULL);
19225 set_optab_libfunc (le_optab, HFmode, NULL);
19226 set_optab_libfunc (ge_optab, HFmode, NULL);
19227 set_optab_libfunc (gt_optab, HFmode, NULL);
19228 set_optab_libfunc (unord_optab, HFmode, NULL);
19229 }
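
/* For illustration, with the NULL libfuncs above a half-precision addition
   such as

	__fp16 add_hf (__fp16 a, __fp16 b) { return a + b; }

   is expected to widen both operands via __gnu_h2f_ieee, add in SFmode and
   narrow the result via __gnu_f2h_ieee, unless TARGET_FP_F16INST provides
   native HFmode instructions.  */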
19230
19231 /* Target hook for c_mode_for_suffix. */
19232 static machine_mode
19233 aarch64_c_mode_for_suffix (char suffix)
19234 {
19235 if (suffix == 'q')
19236 return TFmode;
19237
19238 return VOIDmode;
19239 }
19240
19241 /* We can only represent floating point constants which will fit in
19242 "quarter-precision" values. These values are characterised by
19243 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
19244 by:
19245
19246 (-1)^s * (n/16) * 2^r
19247
19248 Where:
19249 's' is the sign bit.
19250 'n' is an integer in the range 16 <= n <= 31.
19251 'r' is an integer in the range -3 <= r <= 4. */
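
/* For example, values that satisfy these constraints include:

	 0.25 = (+1) * (16/16) * 2^-2	(s = 0, n = 16, r = -2)
	 1.75 = (+1) * (28/16) * 2^0	(s = 0, n = 28, r =  0)
	-31.0 = (-1) * (31/16) * 2^4	(s = 1, n = 31, r =  4)

   whereas a value such as 0.1 has no exact (n, r) pair in these ranges and
   is therefore not representable.  */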
19252
19253 /* Return true iff X can be represented by a quarter-precision
19254 floating point immediate operand. Note that we cannot represent 0.0. */
19255 bool
19256 aarch64_float_const_representable_p (rtx x)
19257 {
19258 /* This represents our current view of how many bits
19259 make up the mantissa. */
19260 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19261 int exponent;
19262 unsigned HOST_WIDE_INT mantissa, mask;
19263 REAL_VALUE_TYPE r, m;
19264 bool fail;
19265
19266 x = unwrap_const_vec_duplicate (x);
19267 if (!CONST_DOUBLE_P (x))
19268 return false;
19269
19270 if (GET_MODE (x) == VOIDmode
19271 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19272 return false;
19273
19274 r = *CONST_DOUBLE_REAL_VALUE (x);
19275
19276 /* We cannot represent infinities, NaNs or +/-zero. We won't
19277 know if we have +zero until we analyse the mantissa, but we
19278 can reject the other invalid values. */
19279 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19280 || REAL_VALUE_MINUS_ZERO (r))
19281 return false;
19282
19283 /* Extract exponent. */
19284 r = real_value_abs (&r);
19285 exponent = REAL_EXP (&r);
19286
19287 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19288 highest (sign) bit, with a fixed binary point at bit point_pos.
19289 The low half of W holds the low part of the mantissa, the high half the high part.
19290 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19291 bits for the mantissa, this can fail (low bits will be lost). */
19292 real_ldexp (&m, &r, point_pos - exponent);
19293 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19294
19295 /* If the low part of the mantissa has bits set we cannot represent
19296 the value. */
19297 if (w.ulow () != 0)
19298 return false;
19299 /* We have rejected the lower HOST_WIDE_INT, so update our
19300 understanding of how many bits lie in the mantissa and
19301 look only at the high HOST_WIDE_INT. */
19302 mantissa = w.elt (1);
19303 point_pos -= HOST_BITS_PER_WIDE_INT;
19304
19305 /* We can only represent values with a mantissa of the form 1.xxxx. */
19306 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19307 if ((mantissa & mask) != 0)
19308 return false;
19309
19310 /* Having filtered unrepresentable values, we may now remove all
19311 but the highest 5 bits. */
19312 mantissa >>= point_pos - 5;
19313
19314 /* We cannot represent the value 0.0, so reject it. This is handled
19315 elsewhere. */
19316 if (mantissa == 0)
19317 return false;
19318
19319 /* Then, as bit 4 is always set, we can mask it off, leaving
19320 the mantissa in the range [0, 15]. */
19321 mantissa &= ~(1 << 4);
19322 gcc_assert (mantissa <= 15);
19323
19324 /* GCC internally does not use IEEE754-like encoding (where normalized
19325 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
19326 Our mantissa values are shifted 4 places to the left relative to
19327 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19328 by 5 places to correct for GCC's representation. */
19329 exponent = 5 - exponent;
19330
19331 return (exponent >= 0 && exponent <= 7);
19332 }
19333
19334 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19335 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
19336 output MOVI/MVNI, ORR or BIC immediate. */
19337 char*
19338 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
19339 enum simd_immediate_check which)
19340 {
19341 bool is_valid;
19342 static char templ[40];
19343 const char *mnemonic;
19344 const char *shift_op;
19345 unsigned int lane_count = 0;
19346 char element_char;
19347
19348 struct simd_immediate_info info;
19349
19350 /* This will return true to show const_vector is legal for use as either
19351 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19352 It will also update INFO to show how the immediate should be generated.
19353 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
19354 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
19355 gcc_assert (is_valid);
19356
19357 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19358 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
19359
19360 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19361 {
19362 gcc_assert (info.insn == simd_immediate_info::MOV
19363 && info.u.mov.shift == 0);
19364 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19365 move immediate path. */
19366 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19367 info.u.mov.value = GEN_INT (0);
19368 else
19369 {
19370 const unsigned int buf_size = 20;
19371 char float_buf[buf_size] = {'\0'};
19372 real_to_decimal_for_mode (float_buf,
19373 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19374 buf_size, buf_size, 1, info.elt_mode);
19375
19376 if (lane_count == 1)
19377 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
19378 else
19379 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
19380 lane_count, element_char, float_buf);
19381 return templ;
19382 }
19383 }
19384
19385 gcc_assert (CONST_INT_P (info.u.mov.value));
19386
19387 if (which == AARCH64_CHECK_MOV)
19388 {
19389 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
19390 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
19391 ? "msl" : "lsl");
19392 if (lane_count == 1)
19393 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
19394 mnemonic, UINTVAL (info.u.mov.value));
19395 else if (info.u.mov.shift)
19396 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19397 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
19398 element_char, UINTVAL (info.u.mov.value), shift_op,
19399 info.u.mov.shift);
19400 else
19401 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19402 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
19403 element_char, UINTVAL (info.u.mov.value));
19404 }
19405 else
19406 {
19407 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
19408 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
19409 if (info.u.mov.shift)
19410 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19411 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
19412 element_char, UINTVAL (info.u.mov.value), "lsl",
19413 info.u.mov.shift);
19414 else
19415 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19416 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
19417 element_char, UINTVAL (info.u.mov.value));
19418 }
19419 return templ;
19420 }
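
/* For illustration, for a V16QI constant with every byte equal to 0x12 and
   WHICH == AARCH64_CHECK_MOV, assuming aarch64_simd_valid_immediate reports
   8-bit elements with no shift, the template produced above would be along
   the lines of "movi\t%0.16b, 0x12"; the actual mnemonic, element size and
   shift depend on how the immediate is classified.  */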
19421
19422 char*
19423 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
19424 {
19425
19426 /* If a floating point number was passed and we desire to use it in an
19427 integer mode, do the conversion to integer. */
19428 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
19429 {
19430 unsigned HOST_WIDE_INT ival;
19431 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
19432 gcc_unreachable ();
19433 immediate = gen_int_mode (ival, mode);
19434 }
19435
19436 machine_mode vmode;
19437 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
19438 a 128-bit vector mode. */
19439 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
19440
19441 vmode = aarch64_simd_container_mode (mode, width);
19442 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
19443 return aarch64_output_simd_mov_immediate (v_op, width);
19444 }
19445
19446 /* Return the output string to use for moving immediate CONST_VECTOR
19447 into an SVE register. */
19448
19449 char *
19450 aarch64_output_sve_mov_immediate (rtx const_vector)
19451 {
19452 static char templ[40];
19453 struct simd_immediate_info info;
19454 char element_char;
19455
19456 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
19457 gcc_assert (is_valid);
19458
19459 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19460
19461 machine_mode vec_mode = GET_MODE (const_vector);
19462 if (aarch64_sve_pred_mode_p (vec_mode))
19463 {
19464 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
19465 if (info.insn == simd_immediate_info::MOV)
19466 {
19467 gcc_assert (info.u.mov.value == const0_rtx);
19468 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
19469 }
19470 else
19471 {
19472 gcc_assert (info.insn == simd_immediate_info::PTRUE);
19473 unsigned int total_bytes;
19474 if (info.u.pattern == AARCH64_SV_ALL
19475 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
19476 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19477 total_bytes / GET_MODE_SIZE (info.elt_mode));
19478 else
19479 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19480 svpattern_token (info.u.pattern));
19481 }
19482 return buf;
19483 }
19484
19485 if (info.insn == simd_immediate_info::INDEX)
19486 {
19487 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19488 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19489 element_char, INTVAL (info.u.index.base),
19490 INTVAL (info.u.index.step));
19491 return templ;
19492 }
19493
19494 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19495 {
19496 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19497 info.u.mov.value = GEN_INT (0);
19498 else
19499 {
19500 const int buf_size = 20;
19501 char float_buf[buf_size] = {};
19502 real_to_decimal_for_mode (float_buf,
19503 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19504 buf_size, buf_size, 1, info.elt_mode);
19505
19506 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19507 element_char, float_buf);
19508 return templ;
19509 }
19510 }
19511
19512 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19513 element_char, INTVAL (info.u.mov.value));
19514 return templ;
19515 }
19516
19517 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19518 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19519 pattern. */
19520
19521 char *
19522 aarch64_output_sve_ptrues (rtx const_unspec)
19523 {
19524 static char templ[40];
19525
19526 struct simd_immediate_info info;
19527 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19528 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19529
19530 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19531 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19532 svpattern_token (info.u.pattern));
19533 return templ;
19534 }
19535
19536 /* Split operands into moves from op[1] + op[2] into op[0]. */
19537
19538 void
19539 aarch64_split_combinev16qi (rtx operands[3])
19540 {
19541 unsigned int dest = REGNO (operands[0]);
19542 unsigned int src1 = REGNO (operands[1]);
19543 unsigned int src2 = REGNO (operands[2]);
19544 machine_mode halfmode = GET_MODE (operands[1]);
19545 unsigned int halfregs = REG_NREGS (operands[1]);
19546 rtx destlo, desthi;
19547
19548 gcc_assert (halfmode == V16QImode);
19549
19550 if (src1 == dest && src2 == dest + halfregs)
19551 {
19552 /* No-op move. Can't split to nothing; emit something. */
19553 emit_note (NOTE_INSN_DELETED);
19554 return;
19555 }
19556
19557 /* Preserve register attributes for variable tracking. */
19558 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19559 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19560 GET_MODE_SIZE (halfmode));
19561
19562 /* Special case of reversed high/low parts: swap the inputs with three EORs, avoiding a scratch register. */
19563 if (reg_overlap_mentioned_p (operands[2], destlo)
19564 && reg_overlap_mentioned_p (operands[1], desthi))
19565 {
19566 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19567 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
19568 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19569 }
19570 else if (!reg_overlap_mentioned_p (operands[2], destlo))
19571 {
19572 /* Try to avoid unnecessary moves if part of the result
19573 is in the right place already. */
19574 if (src1 != dest)
19575 emit_move_insn (destlo, operands[1]);
19576 if (src2 != dest + halfregs)
19577 emit_move_insn (desthi, operands[2]);
19578 }
19579 else
19580 {
19581 if (src2 != dest + halfregs)
19582 emit_move_insn (desthi, operands[2]);
19583 if (src1 != dest)
19584 emit_move_insn (destlo, operands[1]);
19585 }
19586 }
19587
19588 /* vec_perm support. */
19589
19590 struct expand_vec_perm_d
19591 {
19592 rtx target, op0, op1;
19593 vec_perm_indices perm;
19594 machine_mode vmode;
19595 unsigned int vec_flags;
19596 bool one_vector_p;
19597 bool testing_p;
19598 };
19599
19600 /* Generate a variable permutation. */
19601
19602 static void
19603 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
19604 {
19605 machine_mode vmode = GET_MODE (target);
19606 bool one_vector_p = rtx_equal_p (op0, op1);
19607
19608 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
19609 gcc_checking_assert (GET_MODE (op0) == vmode);
19610 gcc_checking_assert (GET_MODE (op1) == vmode);
19611 gcc_checking_assert (GET_MODE (sel) == vmode);
19612 gcc_checking_assert (TARGET_SIMD);
19613
19614 if (one_vector_p)
19615 {
19616 if (vmode == V8QImode)
19617 {
19618 /* Expand the argument to a V16QI mode by duplicating it. */
19619 rtx pair = gen_reg_rtx (V16QImode);
19620 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
19621 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19622 }
19623 else
19624 {
19625 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
19626 }
19627 }
19628 else
19629 {
19630 rtx pair;
19631
19632 if (vmode == V8QImode)
19633 {
19634 pair = gen_reg_rtx (V16QImode);
19635 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19636 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19637 }
19638 else
19639 {
19640 pair = gen_reg_rtx (OImode);
19641 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19642 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19643 }
19644 }
19645 }
19646
19647 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19648 NELT is the number of elements in the vector. */
19649
19650 void
19651 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19652 unsigned int nelt)
19653 {
19654 machine_mode vmode = GET_MODE (target);
19655 bool one_vector_p = rtx_equal_p (op0, op1);
19656 rtx mask;
19657
19658 /* The TBL instruction does not use a modulo index, so we must take care
19659 of that ourselves. */
19660 mask = aarch64_simd_gen_const_vector_dup (vmode,
19661 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19662 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19663
19664 /* For big-endian, we also need to reverse the index within the vector
19665 (but not which vector). */
19666 if (BYTES_BIG_ENDIAN)
19667 {
19668 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19669 if (!one_vector_p)
19670 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19671 sel = expand_simple_binop (vmode, XOR, sel, mask,
19672 NULL, 0, OPTAB_LIB_WIDEN);
19673 }
19674 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19675 }
19676
19677 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19678
19679 static void
19680 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19681 {
19682 emit_insn (gen_rtx_SET (target,
19683 gen_rtx_UNSPEC (GET_MODE (target),
19684 gen_rtvec (2, op0, op1), code)));
19685 }
19686
19687 /* Expand an SVE vec_perm with the given operands. */
19688
19689 void
19690 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19691 {
19692 machine_mode data_mode = GET_MODE (target);
19693 machine_mode sel_mode = GET_MODE (sel);
19694 /* Enforced by the pattern condition. */
19695 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19696
19697 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19698 size of the two value vectors, i.e. the upper bits of the indices
19699 are effectively ignored. SVE TBL instead produces 0 for any
19700 out-of-range indices, so we need to wrap the vec_perm indices
19701 explicitly to ensure they are all in range. */
19702 rtx sel_reg = force_reg (sel_mode, sel);
19703
19704 /* Check if the sel only references the first values vector. */
19705 if (GET_CODE (sel) == CONST_VECTOR
19706 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19707 {
19708 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19709 return;
19710 }
19711
19712 /* Check if the two values vectors are the same. */
19713 if (rtx_equal_p (op0, op1))
19714 {
19715 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19716 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19717 NULL, 0, OPTAB_DIRECT);
19718 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19719 return;
19720 }
19721
19722 /* Run a TBL on each value vector and combine the results. */
19723
19724 rtx res0 = gen_reg_rtx (data_mode);
19725 rtx res1 = gen_reg_rtx (data_mode);
19726 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19727 if (GET_CODE (sel) != CONST_VECTOR
19728 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19729 {
19730 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19731 2 * nunits - 1);
19732 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19733 NULL, 0, OPTAB_DIRECT);
19734 }
19735 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19736 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19737 NULL, 0, OPTAB_DIRECT);
19738 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19739 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19740 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19741 else
19742 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19743 }
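
/* As a worked example, assume a 128-bit vector length, so that SEL_MODE has
   four elements (NUNITS == 4), and sel = { 0, 5, 2, 7 }.  Then:

	res0   = TBL (op0, sel)      = { op0[0], 0, op0[2], 0 }
	res1   = TBL (op1, sel - 4)  = { 0, op1[1], 0, op1[3] }
	target = res0 | res1	     = { op0[0], op1[1], op0[2], op1[3] }

   The subtraction wraps indices that referred to OP0 to large out-of-range
   values, which SVE TBL maps to zero, so the final OR merges the two
   results cleanly.  */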
19744
19745 /* Recognize patterns suitable for the TRN instructions. */
19746 static bool
19747 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19748 {
19749 HOST_WIDE_INT odd;
19750 poly_uint64 nelt = d->perm.length ();
19751 rtx out, in0, in1, x;
19752 machine_mode vmode = d->vmode;
19753
19754 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19755 return false;
19756
19757 /* Note that these are little-endian tests.
19758 We correct for big-endian later. */
19759 if (!d->perm[0].is_constant (&odd)
19760 || (odd != 0 && odd != 1)
19761 || !d->perm.series_p (0, 2, odd, 2)
19762 || !d->perm.series_p (1, 2, nelt + odd, 2))
19763 return false;
19764
19765 /* Success! */
19766 if (d->testing_p)
19767 return true;
19768
19769 in0 = d->op0;
19770 in1 = d->op1;
19771 /* We don't need a big-endian lane correction for SVE; see the comment
19772 at the head of aarch64-sve.md for details. */
19773 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19774 {
19775 x = in0, in0 = in1, in1 = x;
19776 odd = !odd;
19777 }
19778 out = d->target;
19779
19780 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19781 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19782 return true;
19783 }
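
/* For example, with two V4SI inputs the index vectors accepted above are
   { 0, 4, 2, 6 } for TRN1 (odd == 0) and { 1, 5, 3, 7 } for TRN2
   (odd == 1), i.e. the even-numbered or odd-numbered elements of the two
   inputs interleaved.  */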
19784
19785 /* Recognize patterns suitable for the UZP instructions. */
19786 static bool
19787 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19788 {
19789 HOST_WIDE_INT odd;
19790 rtx out, in0, in1, x;
19791 machine_mode vmode = d->vmode;
19792
19793 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19794 return false;
19795
19796 /* Note that these are little-endian tests.
19797 We correct for big-endian later. */
19798 if (!d->perm[0].is_constant (&odd)
19799 || (odd != 0 && odd != 1)
19800 || !d->perm.series_p (0, 1, odd, 2))
19801 return false;
19802
19803 /* Success! */
19804 if (d->testing_p)
19805 return true;
19806
19807 in0 = d->op0;
19808 in1 = d->op1;
19809 /* We don't need a big-endian lane correction for SVE; see the comment
19810 at the head of aarch64-sve.md for details. */
19811 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19812 {
19813 x = in0, in0 = in1, in1 = x;
19814 odd = !odd;
19815 }
19816 out = d->target;
19817
19818 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19819 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19820 return true;
19821 }
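
/* For example, with two V4SI inputs the index vectors accepted above are
   { 0, 2, 4, 6 } for UZP1 (odd == 0) and { 1, 3, 5, 7 } for UZP2
   (odd == 1), i.e. the even-numbered or odd-numbered elements of the
   concatenation of the two inputs.  */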
19822
19823 /* Recognize patterns suitable for the ZIP instructions. */
19824 static bool
19825 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19826 {
19827 unsigned int high;
19828 poly_uint64 nelt = d->perm.length ();
19829 rtx out, in0, in1, x;
19830 machine_mode vmode = d->vmode;
19831
19832 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19833 return false;
19834
19835 /* Note that these are little-endian tests.
19836 We correct for big-endian later. */
19837 poly_uint64 first = d->perm[0];
19838 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19839 || !d->perm.series_p (0, 2, first, 1)
19840 || !d->perm.series_p (1, 2, first + nelt, 1))
19841 return false;
19842 high = maybe_ne (first, 0U);
19843
19844 /* Success! */
19845 if (d->testing_p)
19846 return true;
19847
19848 in0 = d->op0;
19849 in1 = d->op1;
19850 /* We don't need a big-endian lane correction for SVE; see the comment
19851 at the head of aarch64-sve.md for details. */
19852 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19853 {
19854 x = in0, in0 = in1, in1 = x;
19855 high = !high;
19856 }
19857 out = d->target;
19858
19859 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19860 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19861 return true;
19862 }
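
/* For example, with two V4SI inputs the index vectors accepted above are
   { 0, 4, 1, 5 } for ZIP1 (first == 0) and { 2, 6, 3, 7 } for ZIP2
   (first == nelt / 2), i.e. the low or high halves of the two inputs
   interleaved element by element.  */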
19863
19864 /* Recognize patterns for the EXT insn. */
19865
19866 static bool
19867 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19868 {
19869 HOST_WIDE_INT location;
19870 rtx offset;
19871
19872 /* The first element always refers to the first vector.
19873 Check if the extracted indices are increasing by one. */
19874 if (d->vec_flags == VEC_SVE_PRED
19875 || !d->perm[0].is_constant (&location)
19876 || !d->perm.series_p (0, 1, location, 1))
19877 return false;
19878
19879 /* Success! */
19880 if (d->testing_p)
19881 return true;
19882
19883 /* The case where (location == 0) is a no-op for both big- and little-endian,
19884 and is removed by the mid-end at optimization levels -O1 and higher.
19885
19886 We don't need a big-endian lane correction for SVE; see the comment
19887 at the head of aarch64-sve.md for details. */
19888 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19889 {
19890 /* After setup, we want the high elements of the first vector (stored
19891 at the LSB end of the register), and the low elements of the second
19892 vector (stored at the MSB end of the register). So swap. */
19893 std::swap (d->op0, d->op1);
19894 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19895 to_constant () is safe since this is restricted to Advanced SIMD
19896 vectors. */
19897 location = d->perm.length ().to_constant () - location;
19898 }
19899
19900 offset = GEN_INT (location);
19901 emit_set_insn (d->target,
19902 gen_rtx_UNSPEC (d->vmode,
19903 gen_rtvec (3, d->op0, d->op1, offset),
19904 UNSPEC_EXT));
19905 return true;
19906 }
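
/* For example, with two V4SI inputs a permutation of { 1, 2, 3, 4 } has
   LOCATION == 1, so the EXT above extracts elements 1..3 of the first
   vector followed by element 0 of the second, i.e. the concatenation
   shifted down by one element (four bytes).  */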
19907
19908 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19909 within each 64-bit, 32-bit or 16-bit granule. */
19910
19911 static bool
19912 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19913 {
19914 HOST_WIDE_INT diff;
19915 unsigned int i, size, unspec;
19916 machine_mode pred_mode;
19917
19918 if (d->vec_flags == VEC_SVE_PRED
19919 || !d->one_vector_p
19920 || !d->perm[0].is_constant (&diff))
19921 return false;
19922
19923 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19924 if (size == 8)
19925 {
19926 unspec = UNSPEC_REV64;
19927 pred_mode = VNx2BImode;
19928 }
19929 else if (size == 4)
19930 {
19931 unspec = UNSPEC_REV32;
19932 pred_mode = VNx4BImode;
19933 }
19934 else if (size == 2)
19935 {
19936 unspec = UNSPEC_REV16;
19937 pred_mode = VNx8BImode;
19938 }
19939 else
19940 return false;
19941
19942 unsigned int step = diff + 1;
19943 for (i = 0; i < step; ++i)
19944 if (!d->perm.series_p (i, step, diff - i, step))
19945 return false;
19946
19947 /* Success! */
19948 if (d->testing_p)
19949 return true;
19950
19951 if (d->vec_flags == VEC_SVE_DATA)
19952 {
19953 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19954 rtx target = gen_reg_rtx (int_mode);
19955 if (BYTES_BIG_ENDIAN)
19956 /* The act of taking a subreg between INT_MODE and d->vmode
19957 is itself a reversing operation on big-endian targets;
19958 see the comment at the head of aarch64-sve.md for details.
19959 First reinterpret OP0 as INT_MODE without using a subreg
19960 and without changing the contents. */
19961 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19962 else
19963 {
19964 /* For SVE we use REV[BHW] unspecs derived from the element size
19965 of d->vmode and vector modes whose elements have SIZE bytes.
19966 This ensures that the vector modes match the predicate modes. */
19967 int unspec = aarch64_sve_rev_unspec (d->vmode);
19968 rtx pred = aarch64_ptrue_reg (pred_mode);
19969 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19970 gen_lowpart (int_mode, d->op0)));
19971 }
19972 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19973 return true;
19974 }
19975 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19976 emit_set_insn (d->target, src);
19977 return true;
19978 }
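
/* For example, a single V16QI input with a permutation of
   { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, ..., 8 } has DIFF == 7, giving
   SIZE == 8, so the code above emits a REV64 that reverses the bytes
   within each 64-bit granule.  */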
19979
19980 /* Recognize patterns for the REV insn, which reverses elements within
19981 a full vector. */
19982
19983 static bool
19984 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19985 {
19986 poly_uint64 nelt = d->perm.length ();
19987
19988 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19989 return false;
19990
19991 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19992 return false;
19993
19994 /* Success! */
19995 if (d->testing_p)
19996 return true;
19997
19998 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19999 emit_set_insn (d->target, src);
20000 return true;
20001 }
20002
20003 static bool
20004 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20005 {
20006 rtx out = d->target;
20007 rtx in0;
20008 HOST_WIDE_INT elt;
20009 machine_mode vmode = d->vmode;
20010 rtx lane;
20011
20012 if (d->vec_flags == VEC_SVE_PRED
20013 || d->perm.encoding ().encoded_nelts () != 1
20014 || !d->perm[0].is_constant (&elt))
20015 return false;
20016
20017 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20018 return false;
20019
20020 /* Success! */
20021 if (d->testing_p)
20022 return true;
20023
20024 /* The generic preparation in aarch64_expand_vec_perm_const_1
20025 swaps the operand order and the permute indices if it finds
20026 d->perm[0] to be in the second operand. Thus, we can always
20027 use d->op0 and need not do any extra arithmetic to get the
20028 correct lane number. */
20029 in0 = d->op0;
20030 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
20031
20032 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20033 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20034 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20035 return true;
20036 }
20037
20038 static bool
20039 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20040 {
20041 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20042 machine_mode vmode = d->vmode;
20043
20044 /* Make sure that the indices are constant. */
20045 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20046 for (unsigned int i = 0; i < encoded_nelts; ++i)
20047 if (!d->perm[i].is_constant ())
20048 return false;
20049
20050 if (d->testing_p)
20051 return true;
20052
20053 /* Generic code will try constant permutation twice: once with the
20054 original mode and again with the elements lowered to QImode.
20055 So wait and don't do the selector expansion ourselves. */
20056 if (vmode != V8QImode && vmode != V16QImode)
20057 return false;
20058
20059 /* to_constant is safe since this routine is specific to Advanced SIMD
20060 vectors. */
20061 unsigned int nelt = d->perm.length ().to_constant ();
20062 for (unsigned int i = 0; i < nelt; ++i)
20063 /* If big-endian and two vectors we end up with a weird mixed-endian
20064 mode on NEON. Reverse the index within each word but not the word
20065 itself. to_constant is safe because we checked is_constant above. */
20066 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20067 ? d->perm[i].to_constant () ^ (nelt - 1)
20068 : d->perm[i].to_constant ());
20069
20070 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20071 sel = force_reg (vmode, sel);
20072
20073 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20074 return true;
20075 }
20076
20077 /* Try to implement D using an SVE TBL instruction. */
20078
20079 static bool
20080 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20081 {
20082 unsigned HOST_WIDE_INT nelt;
20083
20084 /* Permuting two variable-length vectors could overflow the
20085 index range. */
20086 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20087 return false;
20088
20089 if (d->testing_p)
20090 return true;
20091
20092 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20093 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20094 if (d->one_vector_p)
20095 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20096 else
20097 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20098 return true;
20099 }
20100
20101 /* Try to implement D using SVE SEL instruction. */
20102
20103 static bool
20104 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20105 {
20106 machine_mode vmode = d->vmode;
20107 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20108
20109 if (d->vec_flags != VEC_SVE_DATA
20110 || unit_size > 8)
20111 return false;
20112
20113 int n_patterns = d->perm.encoding ().npatterns ();
20114 poly_int64 vec_len = d->perm.length ();
20115
20116 for (int i = 0; i < n_patterns; ++i)
20117 if (!known_eq (d->perm[i], i)
20118 && !known_eq (d->perm[i], vec_len + i))
20119 return false;
20120
20121 for (int i = n_patterns; i < n_patterns * 2; i++)
20122 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20123 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20124 return false;
20125
20126 if (d->testing_p)
20127 return true;
20128
20129 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20130
20131 /* Build a predicate that is true when op0 elements should be used. */
20132 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20133 for (int i = 0; i < n_patterns * 2; i++)
20134 {
20135 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20136 : CONST0_RTX (BImode);
20137 builder.quick_push (elem);
20138 }
20139
20140 rtx const_vec = builder.build ();
20141 rtx pred = force_reg (pred_mode, const_vec);
20142 /* TARGET = PRED ? OP0 : OP1. */
20143 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20144 return true;
20145 }
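
/* For example, a permutation whose encoding expands to
   { 0, VL + 1, 2, VL + 3, ... } (N_PATTERNS == 2) yields the predicate
   { 1, 0, 1, 0, ... }, so the permutation becomes a single SEL that takes
   even elements from OP0 and odd elements from OP1.  */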
20146
20147 static bool
20148 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20149 {
20150 /* The pattern matching functions above are written to look for a small
20151 number to begin the sequence (0, 1, N/2). If we begin with an index
20152 from the second operand, we can swap the operands. */
20153 poly_int64 nelt = d->perm.length ();
20154 if (known_ge (d->perm[0], nelt))
20155 {
20156 d->perm.rotate_inputs (1);
20157 std::swap (d->op0, d->op1);
20158 }
20159
20160 if ((d->vec_flags == VEC_ADVSIMD
20161 || d->vec_flags == VEC_SVE_DATA
20162 || d->vec_flags == VEC_SVE_PRED)
20163 && known_gt (nelt, 1))
20164 {
20165 if (aarch64_evpc_rev_local (d))
20166 return true;
20167 else if (aarch64_evpc_rev_global (d))
20168 return true;
20169 else if (aarch64_evpc_ext (d))
20170 return true;
20171 else if (aarch64_evpc_dup (d))
20172 return true;
20173 else if (aarch64_evpc_zip (d))
20174 return true;
20175 else if (aarch64_evpc_uzp (d))
20176 return true;
20177 else if (aarch64_evpc_trn (d))
20178 return true;
20179 else if (aarch64_evpc_sel (d))
20180 return true;
20181 if (d->vec_flags == VEC_SVE_DATA)
20182 return aarch64_evpc_sve_tbl (d);
20183 else if (d->vec_flags == VEC_ADVSIMD)
20184 return aarch64_evpc_tbl (d);
20185 }
20186 return false;
20187 }
20188
20189 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
20190
20191 static bool
20192 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20193 rtx op1, const vec_perm_indices &sel)
20194 {
20195 struct expand_vec_perm_d d;
20196
20197 /* Check whether the mask can be applied to a single vector. */
20198 if (sel.ninputs () == 1
20199 || (op0 && rtx_equal_p (op0, op1)))
20200 d.one_vector_p = true;
20201 else if (sel.all_from_input_p (0))
20202 {
20203 d.one_vector_p = true;
20204 op1 = op0;
20205 }
20206 else if (sel.all_from_input_p (1))
20207 {
20208 d.one_vector_p = true;
20209 op0 = op1;
20210 }
20211 else
20212 d.one_vector_p = false;
20213
20214 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
20215 sel.nelts_per_input ());
20216 d.vmode = vmode;
20217 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
20218 d.target = target;
20219 d.op0 = op0;
20220 d.op1 = op1;
20221 d.testing_p = !target;
20222
20223 if (!d.testing_p)
20224 return aarch64_expand_vec_perm_const_1 (&d);
20225
20226 rtx_insn *last = get_last_insn ();
20227 bool ret = aarch64_expand_vec_perm_const_1 (&d);
20228 gcc_assert (last == get_last_insn ());
20229
20230 return ret;
20231 }
20232
20233 /* Generate a byte permute mask for a register of mode MODE,
20234 which has NUNITS units. */
20235
20236 rtx
20237 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
20238 {
20239 /* We have to reverse each vector because we don't have
20240 a permuted load that can reverse-load according to ABI rules. */
20241 rtx mask;
20242 rtvec v = rtvec_alloc (16);
20243 unsigned int i, j;
20244 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
20245
20246 gcc_assert (BYTES_BIG_ENDIAN);
20247 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
20248
20249 for (i = 0; i < nunits; i++)
20250 for (j = 0; j < usize; j++)
20251 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
20252 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
20253 return force_reg (V16QImode, mask);
20254 }
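
/* For example, for MODE == V4SImode (NUNITS == 4, unit size 4) the byte
   permute mask built above is

	{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }

   i.e. the bytes within each 32-bit element are reversed while the order
   of the elements themselves is preserved.  */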
20255
20256 /* Expand an SVE integer comparison using the SVE equivalent of:
20257
20258 (set TARGET (CODE OP0 OP1)). */
20259
20260 void
20261 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
20262 {
20263 machine_mode pred_mode = GET_MODE (target);
20264 machine_mode data_mode = GET_MODE (op0);
20265 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
20266 op0, op1);
20267 if (!rtx_equal_p (target, res))
20268 emit_move_insn (target, res);
20269 }
20270
20271 /* Return the UNSPEC_COND_* code for comparison CODE. */
20272
20273 static unsigned int
20274 aarch64_unspec_cond_code (rtx_code code)
20275 {
20276 switch (code)
20277 {
20278 case NE:
20279 return UNSPEC_COND_FCMNE;
20280 case EQ:
20281 return UNSPEC_COND_FCMEQ;
20282 case LT:
20283 return UNSPEC_COND_FCMLT;
20284 case GT:
20285 return UNSPEC_COND_FCMGT;
20286 case LE:
20287 return UNSPEC_COND_FCMLE;
20288 case GE:
20289 return UNSPEC_COND_FCMGE;
20290 case UNORDERED:
20291 return UNSPEC_COND_FCMUO;
20292 default:
20293 gcc_unreachable ();
20294 }
20295 }
20296
20297 /* Emit:
20298
20299 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20300
20301 where <X> is the operation associated with comparison CODE.
20302 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20303
20304 static void
20305 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
20306 bool known_ptrue_p, rtx op0, rtx op1)
20307 {
20308 rtx flag = gen_int_mode (known_ptrue_p, SImode);
20309 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
20310 gen_rtvec (4, pred, flag, op0, op1),
20311 aarch64_unspec_cond_code (code));
20312 emit_set_insn (target, unspec);
20313 }
20314
20315 /* Emit the SVE equivalent of:
20316
20317 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
20318 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
20319 (set TARGET (ior:PRED_MODE TMP1 TMP2))
20320
20321 where <Xi> is the operation associated with comparison CODEi.
20322 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20323
20324 static void
20325 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
20326 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
20327 {
20328 machine_mode pred_mode = GET_MODE (pred);
20329 rtx tmp1 = gen_reg_rtx (pred_mode);
20330 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
20331 rtx tmp2 = gen_reg_rtx (pred_mode);
20332 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
20333 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
20334 }
20335
20336 /* Emit the SVE equivalent of:
20337
20338 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20339 (set TARGET (not TMP))
20340
20341 where <X> is the operation associated with comparison CODE.
20342 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20343
20344 static void
20345 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
20346 bool known_ptrue_p, rtx op0, rtx op1)
20347 {
20348 machine_mode pred_mode = GET_MODE (pred);
20349 rtx tmp = gen_reg_rtx (pred_mode);
20350 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
20351 aarch64_emit_unop (target, one_cmpl_optab, tmp);
20352 }
20353
20354 /* Expand an SVE floating-point comparison using the SVE equivalent of:
20355
20356 (set TARGET (CODE OP0 OP1))
20357
20358 If CAN_INVERT_P is true, the caller can also handle inverted results;
20359 return true if the result is in fact inverted. */
20360
20361 bool
20362 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
20363 rtx op0, rtx op1, bool can_invert_p)
20364 {
20365 machine_mode pred_mode = GET_MODE (target);
20366 machine_mode data_mode = GET_MODE (op0);
20367
20368 rtx ptrue = aarch64_ptrue_reg (pred_mode);
20369 switch (code)
20370 {
20371 case UNORDERED:
20372 /* UNORDERED has no immediate form. */
20373 op1 = force_reg (data_mode, op1);
20374 /* fall through */
20375 case LT:
20376 case LE:
20377 case GT:
20378 case GE:
20379 case EQ:
20380 case NE:
20381 {
20382 /* There is native support for the comparison. */
20383 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20384 return false;
20385 }
20386
20387 case LTGT:
20388 /* This is a trapping operation (LT or GT). */
20389 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
20390 return false;
20391
20392 case UNEQ:
20393 if (!flag_trapping_math)
20394 {
20395 /* This would trap for signaling NaNs. */
20396 op1 = force_reg (data_mode, op1);
20397 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
20398 ptrue, true, op0, op1);
20399 return false;
20400 }
20401 /* fall through */
20402 case UNLT:
20403 case UNLE:
20404 case UNGT:
20405 case UNGE:
20406 if (flag_trapping_math)
20407 {
20408 /* Work out which elements are ordered. */
20409 rtx ordered = gen_reg_rtx (pred_mode);
20410 op1 = force_reg (data_mode, op1);
20411 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
20412 ptrue, true, op0, op1);
20413
20414 /* Test the opposite condition for the ordered elements,
20415 then invert the result. */
20416 if (code == UNEQ)
20417 code = NE;
20418 else
20419 code = reverse_condition_maybe_unordered (code);
20420 if (can_invert_p)
20421 {
20422 aarch64_emit_sve_fp_cond (target, code,
20423 ordered, false, op0, op1);
20424 return true;
20425 }
20426 aarch64_emit_sve_invert_fp_cond (target, code,
20427 ordered, false, op0, op1);
20428 return false;
20429 }
20430 break;
20431
20432 case ORDERED:
20433 /* ORDERED has no immediate form. */
20434 op1 = force_reg (data_mode, op1);
20435 break;
20436
20437 default:
20438 gcc_unreachable ();
20439 }
20440
20441 /* There is native support for the inverse comparison. */
20442 code = reverse_condition_maybe_unordered (code);
20443 if (can_invert_p)
20444 {
20445 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20446 return true;
20447 }
20448 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
20449 return false;
20450 }
20451
20452 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
20453 of the data being selected and CMP_MODE is the mode of the values being
20454 compared. */
20455
20456 void
20457 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
20458 rtx *ops)
20459 {
20460 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
20461 rtx pred = gen_reg_rtx (pred_mode);
20462 if (FLOAT_MODE_P (cmp_mode))
20463 {
20464 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
20465 ops[4], ops[5], true))
20466 std::swap (ops[1], ops[2]);
20467 }
20468 else
20469 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
20470
20471 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
20472 ops[1] = force_reg (data_mode, ops[1]);
20473 /* The "false" value can only be zero if the "true" value is a constant. */
20474 if (register_operand (ops[1], data_mode)
20475 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
20476 ops[2] = force_reg (data_mode, ops[2]);
20477
20478 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
20479 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
20480 }
20481
20482 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
20483 true. However, due to issues with register allocation it is preferable
20484 to avoid tying integer scalar and FP scalar modes. Executing integer
20485 operations in general registers is better than treating them as scalar
20486 vector operations. This reduces latency and avoids redundant int<->FP
20487 moves. So tie modes if they are either the same class, or vector modes
20488 with other vector modes, vector structs or any scalar mode. */
20489
20490 static bool
20491 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
20492 {
20493 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
20494 return true;
20495
20496 /* We specifically want to allow elements of "structure" modes to
20497 be tieable to the structure. This more general condition allows
20498 other rarer situations too. The reason we don't extend this to
20499 predicate modes is that there are no predicate structure modes
20500 nor any specific instructions for extracting part of a predicate
20501 register. */
20502 if (aarch64_vector_data_mode_p (mode1)
20503 && aarch64_vector_data_mode_p (mode2))
20504 return true;
20505
20506 /* Also allow any scalar modes with vectors. */
20507 if (aarch64_vector_mode_supported_p (mode1)
20508 || aarch64_vector_mode_supported_p (mode2))
20509 return true;
20510
20511 return false;
20512 }
20513
20514 /* Return a new RTX holding the result of moving POINTER forward by
20515 AMOUNT bytes. */
20516
20517 static rtx
20518 aarch64_move_pointer (rtx pointer, poly_int64 amount)
20519 {
20520 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
20521
20522 return adjust_automodify_address (pointer, GET_MODE (pointer),
20523 next, amount);
20524 }
20525
20526 /* Return a new RTX holding the result of moving POINTER forward by the
20527 size of the mode it points to. */
20528
20529 static rtx
20530 aarch64_progress_pointer (rtx pointer)
20531 {
20532 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
20533 }
20534
20535 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
20536 MODE bytes. */
20537
20538 static void
20539 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
20540 machine_mode mode)
20541 {
20542 rtx reg = gen_reg_rtx (mode);
20543
20544 /* "Cast" the pointers to the correct mode. */
20545 *src = adjust_address (*src, mode, 0);
20546 *dst = adjust_address (*dst, mode, 0);
20547 /* Emit the memcpy. */
20548 emit_move_insn (reg, *src);
20549 emit_move_insn (*dst, reg);
20550 /* Move the pointers forward. */
20551 *src = aarch64_progress_pointer (*src);
20552 *dst = aarch64_progress_pointer (*dst);
20553 }
20554
20555 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
20556 we succeed, otherwise return false. */
20557
20558 bool
20559 aarch64_expand_cpymem (rtx *operands)
20560 {
20561 int n, mode_bits;
20562 rtx dst = operands[0];
20563 rtx src = operands[1];
20564 rtx base;
20565 machine_mode cur_mode = BLKmode, next_mode;
20566 bool speed_p = !optimize_function_for_size_p (cfun);
20567
20568 /* When optimizing for size, give a better estimate of the length of a
20569 memcpy call, but use the default otherwise. Moves larger than 8 bytes
20570 will always require an even number of instructions, and each
20571 operation requires both a load and a store, so divide the max number by 2. */
20572 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
20573
20574 /* We can't do anything smart if the amount to copy is not constant. */
20575 if (!CONST_INT_P (operands[2]))
20576 return false;
20577
20578 n = INTVAL (operands[2]);
20579
20580 /* Try to keep the number of instructions low. For all cases we will do at
20581 most two moves for the residual amount, since we'll always overlap the
20582 remainder. */
20583 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
20584 return false;
20585
20586 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20587 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
20588
20589 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
20590 src = adjust_automodify_address (src, VOIDmode, base, 0);
20591
20592 /* Convert n to bits to make the rest of the code simpler. */
20593 n = n * BITS_PER_UNIT;
20594
20595 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
20596 larger than TImode, but we should not use them for loads/stores here. */
20597 const int copy_limit = GET_MODE_BITSIZE (TImode);
20598
20599 while (n > 0)
20600 {
20601 /* Find the largest mode in which to do the copy without over-reading
20602 or over-writing. */
20603 opt_scalar_int_mode mode_iter;
20604 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
20605 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
20606 cur_mode = mode_iter.require ();
20607
20608 gcc_assert (cur_mode != BLKmode);
20609
20610 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
20611 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
20612
20613 n -= mode_bits;
20614
20615 /* Do certain trailing copies as overlapping if it's going to be
20616 cheaper, i.e. fewer instructions. For instance, for a 15-byte
20617 copy it's more efficient to do two overlapping 8-byte copies than
20618 copies of 8 + 6 + 1 bytes. */
20619 if (n > 0 && n <= 8 * BITS_PER_UNIT)
20620 {
20621 next_mode = smallest_mode_for_size (n, MODE_INT);
20622 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
20623 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
20624 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
20625 n = n_bits;
20626 }
20627 }
20628
20629 return true;
20630 }
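
/* As a worked example, a constant copy of 15 bytes (120 bits) first picks
   DImode (the widest integer mode not exceeding MIN (120, copy_limit))
   and copies bytes 0-7.  The remaining 7 bytes take the trailing-copy
   path: next_mode is DImode again, both pointers are moved back by one
   byte and bytes 7-14 are copied, so the whole copy needs only two
   overlapping 8-byte load/store pairs.  */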
20631
20632 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20633 SImode stores. Handle the case when the constant has identical
20634 bottom and top halves. This is beneficial when the two stores can be
20635 merged into an STP and we avoid synthesising potentially expensive
20636 immediates twice. Return true if such a split is possible. */
20637
20638 bool
20639 aarch64_split_dimode_const_store (rtx dst, rtx src)
20640 {
20641 rtx lo = gen_lowpart (SImode, src);
20642 rtx hi = gen_highpart_mode (SImode, DImode, src);
20643
20644 bool size_p = optimize_function_for_size_p (cfun);
20645
20646 if (!rtx_equal_p (lo, hi))
20647 return false;
20648
20649 unsigned int orig_cost
20650 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20651 unsigned int lo_cost
20652 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20653
20654 /* We want to transform:
20655 MOV x1, 49370
20656 MOVK x1, 0x140, lsl 16
20657 MOVK x1, 0xc0da, lsl 32
20658 MOVK x1, 0x140, lsl 48
20659 STR x1, [x0]
20660 into:
20661 MOV w1, 49370
20662 MOVK w1, 0x140, lsl 16
20663 STP w1, w1, [x0]
20664 So we want to perform this only when we save two instructions
20665 or more. When optimizing for size, however, accept any code size
20666 savings we can. */
20667 if (size_p && orig_cost <= lo_cost)
20668 return false;
20669
20670 if (!size_p
20671 && (orig_cost <= lo_cost + 1))
20672 return false;
20673
20674 rtx mem_lo = adjust_address (dst, SImode, 0);
20675 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20676 return false;
20677
20678 rtx tmp_reg = gen_reg_rtx (SImode);
20679 aarch64_expand_mov_immediate (tmp_reg, lo);
20680 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20681 /* Don't emit an explicit store pair as this may not always be profitable.
20682 Let the sched-fusion logic decide whether to merge them. */
20683 emit_move_insn (mem_lo, tmp_reg);
20684 emit_move_insn (mem_hi, tmp_reg);
20685
20686 return true;
20687 }
20688
20689 /* Generate RTL for a conditional branch with rtx comparison CODE in
20690 mode CC_MODE. The destination of the unlikely conditional branch
20691 is LABEL_REF. */
20692
20693 void
20694 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20695 rtx label_ref)
20696 {
20697 rtx x;
20698 x = gen_rtx_fmt_ee (code, VOIDmode,
20699 gen_rtx_REG (cc_mode, CC_REGNUM),
20700 const0_rtx);
20701
20702 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20703 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20704 pc_rtx);
20705 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20706 }
20707
20708 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20709
20710 OP1 represents the TImode destination operand 1
20711 OP2 represents the TImode destination operand 2
20712 LOW_DEST represents the low half (DImode) of TImode operand 0
20713 LOW_IN1 represents the low half (DImode) of TImode operand 1
20714 LOW_IN2 represents the low half (DImode) of TImode operand 2
20715 HIGH_DEST represents the high half (DImode) of TImode operand 0
20716 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20717 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20718
20719 void
20720 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20721 rtx *low_in1, rtx *low_in2,
20722 rtx *high_dest, rtx *high_in1,
20723 rtx *high_in2)
20724 {
20725 *low_dest = gen_reg_rtx (DImode);
20726 *low_in1 = gen_lowpart (DImode, op1);
20727 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20728 subreg_lowpart_offset (DImode, TImode));
20729 *high_dest = gen_reg_rtx (DImode);
20730 *high_in1 = gen_highpart (DImode, op1);
20731 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20732 subreg_highpart_offset (DImode, TImode));
20733 }
20734
20735 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20736
20737 This function differs from 'aarch64_addti_scratch_regs' in that
20738 OP1 can be an immediate constant (zero). We must call
20739 subreg_highpart_offset with DImode and TImode arguments, otherwise
20740 VOIDmode will be used for the const_int, which generates an internal
20741 error from subreg_size_highpart_offset, which does not expect a size of zero.
20742
20743 OP1 represents the TImode destination operand 1
20744 OP2 represents the TImode destination operand 2
20745 LOW_DEST represents the low half (DImode) of TImode operand 0
20746 LOW_IN1 represents the low half (DImode) of TImode operand 1
20747 LOW_IN2 represents the low half (DImode) of TImode operand 2
20748 HIGH_DEST represents the high half (DImode) of TImode operand 0
20749 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20750 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20751
20752
20753 void
20754 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20755 rtx *low_in1, rtx *low_in2,
20756 rtx *high_dest, rtx *high_in1,
20757 rtx *high_in2)
20758 {
20759 *low_dest = gen_reg_rtx (DImode);
20760 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20761 subreg_lowpart_offset (DImode, TImode));
20762
20763 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20764 subreg_lowpart_offset (DImode, TImode));
20765 *high_dest = gen_reg_rtx (DImode);
20766
20767 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20768 subreg_highpart_offset (DImode, TImode));
20769 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20770 subreg_highpart_offset (DImode, TImode));
20771 }
20772
20773 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20774
20775 OP0 represents the TImode destination operand 0
20776 LOW_DEST represents the low half (DImode) of TImode operand 0
20777 LOW_IN1 represents the low half (DImode) of TImode operand 1
20778 LOW_IN2 represents the low half (DImode) of TImode operand 2
20779 HIGH_DEST represents the high half (DImode) of TImode operand 0
20780 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20781 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20782 UNSIGNED_P is true if the operation is being performed on unsigned
20783 values. */
20784 void
20785 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20786 rtx low_in2, rtx high_dest, rtx high_in1,
20787 rtx high_in2, bool unsigned_p)
20788 {
20789 if (low_in2 == const0_rtx)
20790 {
20791 low_dest = low_in1;
20792 high_in2 = force_reg (DImode, high_in2);
20793 if (unsigned_p)
20794 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20795 else
20796 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20797 }
20798 else
20799 {
20800 if (aarch64_plus_immediate (low_in2, DImode))
20801 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20802 GEN_INT (-INTVAL (low_in2))));
20803 else
20804 {
20805 low_in2 = force_reg (DImode, low_in2);
20806 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20807 }
20808 high_in2 = force_reg (DImode, high_in2);
20809
20810 if (unsigned_p)
20811 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20812 else
20813 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20814 }
20815
20816 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20817 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20818
20819 }
20820
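/* As a rough sketch of the expected output (instruction selection may
   differ slightly): with all inputs in registers, the expansion above
   corresponds to something like

     subs  x_lo_dest, x_lo_in1, x_lo_in2   // low half, sets the borrow
     sbcs  x_hi_dest, x_hi_in1, x_hi_in2   // high half, consumes the borrow

   where the caller then tests C (unsigned) or V (signed) on the second
   instruction for overflow.  */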
20821 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20822
20823 static unsigned HOST_WIDE_INT
20824 aarch64_asan_shadow_offset (void)
20825 {
20826 if (TARGET_ILP32)
20827 return (HOST_WIDE_INT_1 << 29);
20828 else
20829 return (HOST_WIDE_INT_1 << 36);
20830 }
20831
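/* Background sketch rather than a definition: AddressSanitizer maps an
   address to its shadow byte roughly as

     shadow = (addr >> ASAN_SHADOW_SHIFT) + aarch64_asan_shadow_offset ()

   (with ASAN_SHADOW_SHIFT normally 3 on this target), so the constants
   above place the shadow region at 1<<29 for ILP32 and 1<<36 for LP64.  */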
20832 static rtx
20833 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20834 int code, tree treeop0, tree treeop1)
20835 {
20836 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20837 rtx op0, op1;
20838 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20839 insn_code icode;
20840 struct expand_operand ops[4];
20841
20842 start_sequence ();
20843 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20844
20845 op_mode = GET_MODE (op0);
20846 if (op_mode == VOIDmode)
20847 op_mode = GET_MODE (op1);
20848
20849 switch (op_mode)
20850 {
20851 case E_QImode:
20852 case E_HImode:
20853 case E_SImode:
20854 cmp_mode = SImode;
20855 icode = CODE_FOR_cmpsi;
20856 break;
20857
20858 case E_DImode:
20859 cmp_mode = DImode;
20860 icode = CODE_FOR_cmpdi;
20861 break;
20862
20863 case E_SFmode:
20864 cmp_mode = SFmode;
20865 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20866 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20867 break;
20868
20869 case E_DFmode:
20870 cmp_mode = DFmode;
20871 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20872 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20873 break;
20874
20875 default:
20876 end_sequence ();
20877 return NULL_RTX;
20878 }
20879
20880 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20881 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20882 if (!op0 || !op1)
20883 {
20884 end_sequence ();
20885 return NULL_RTX;
20886 }
20887 *prep_seq = get_insns ();
20888 end_sequence ();
20889
20890 create_fixed_operand (&ops[0], op0);
20891 create_fixed_operand (&ops[1], op1);
20892
20893 start_sequence ();
20894 if (!maybe_expand_insn (icode, 2, ops))
20895 {
20896 end_sequence ();
20897 return NULL_RTX;
20898 }
20899 *gen_seq = get_insns ();
20900 end_sequence ();
20901
20902 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20903 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20904 }
20905
20906 static rtx
20907 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20908 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20909 {
20910 rtx op0, op1, target;
20911 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20912 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20913 insn_code icode;
20914 struct expand_operand ops[6];
20915 int aarch64_cond;
20916
20917 push_to_sequence (*prep_seq);
20918 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20919
20920 op_mode = GET_MODE (op0);
20921 if (op_mode == VOIDmode)
20922 op_mode = GET_MODE (op1);
20923
20924 switch (op_mode)
20925 {
20926 case E_QImode:
20927 case E_HImode:
20928 case E_SImode:
20929 cmp_mode = SImode;
20930 break;
20931
20932 case E_DImode:
20933 cmp_mode = DImode;
20934 break;
20935
20936 case E_SFmode:
20937 cmp_mode = SFmode;
20938 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20939 break;
20940
20941 case E_DFmode:
20942 cmp_mode = DFmode;
20943 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20944 break;
20945
20946 default:
20947 end_sequence ();
20948 return NULL_RTX;
20949 }
20950
20951 icode = code_for_ccmp (cc_mode, cmp_mode);
20952
20953 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20954 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20955 if (!op0 || !op1)
20956 {
20957 end_sequence ();
20958 return NULL_RTX;
20959 }
20960 *prep_seq = get_insns ();
20961 end_sequence ();
20962
20963 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20964 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20965
20966 if (bit_code != AND)
20967 {
20968 /* Treat the ccmp patterns as canonical and use them where possible,
20969 but fall back to ccmp_rev patterns if there's no other option. */
20970 rtx_code prev_code = GET_CODE (prev);
20971 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
20972 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
20973 && !(prev_code == EQ
20974 || prev_code == NE
20975 || prev_code == ORDERED
20976 || prev_code == UNORDERED))
20977 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
20978 else
20979 {
20980 rtx_code code = reverse_condition (prev_code);
20981 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
20982 }
20983 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20984 }
20985
20986 create_fixed_operand (&ops[0], XEXP (prev, 0));
20987 create_fixed_operand (&ops[1], target);
20988 create_fixed_operand (&ops[2], op0);
20989 create_fixed_operand (&ops[3], op1);
20990 create_fixed_operand (&ops[4], prev);
20991 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20992
20993 push_to_sequence (*gen_seq);
20994 if (!maybe_expand_insn (icode, 6, ops))
20995 {
20996 end_sequence ();
20997 return NULL_RTX;
20998 }
20999
21000 *gen_seq = get_insns ();
21001 end_sequence ();
21002
21003 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21004 }
21005
21006 #undef TARGET_GEN_CCMP_FIRST
21007 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21008
21009 #undef TARGET_GEN_CCMP_NEXT
21010 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21011
21012 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21013 instruction fusion of some sort. */
21014
21015 static bool
21016 aarch64_macro_fusion_p (void)
21017 {
21018 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21019 }
21020
21021
21022 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21023 should be kept together during scheduling. */
21024
21025 static bool
21026 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21027 {
21028 rtx set_dest;
21029 rtx prev_set = single_set (prev);
21030 rtx curr_set = single_set (curr);
21031 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21032 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21033
21034 if (!aarch64_macro_fusion_p ())
21035 return false;
21036
21037 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21038 {
21039 /* We are trying to match:
21040 prev (mov) == (set (reg r0) (const_int imm16))
21041 curr (movk) == (set (zero_extract (reg r0)
21042 (const_int 16)
21043 (const_int 16))
21044 (const_int imm16_1)) */
21045
21046 set_dest = SET_DEST (curr_set);
21047
21048 if (GET_CODE (set_dest) == ZERO_EXTRACT
21049 && CONST_INT_P (SET_SRC (curr_set))
21050 && CONST_INT_P (SET_SRC (prev_set))
21051 && CONST_INT_P (XEXP (set_dest, 2))
21052 && INTVAL (XEXP (set_dest, 2)) == 16
21053 && REG_P (XEXP (set_dest, 0))
21054 && REG_P (SET_DEST (prev_set))
21055 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21056 {
21057 return true;
21058 }
21059 }
21060
21061 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
21062 {
21063
21064 /* We're trying to match:
21065 prev (adrp) == (set (reg r1)
21066 (high (symbol_ref ("SYM"))))
21067 curr (add) == (set (reg r0)
21068 (lo_sum (reg r1)
21069 (symbol_ref ("SYM"))))
21070 Note that r0 need not necessarily be the same as r1, especially
21071 during pre-regalloc scheduling. */
21072
21073 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21074 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21075 {
21076 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21077 && REG_P (XEXP (SET_SRC (curr_set), 0))
21078 && REGNO (XEXP (SET_SRC (curr_set), 0))
21079 == REGNO (SET_DEST (prev_set))
21080 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21081 XEXP (SET_SRC (curr_set), 1)))
21082 return true;
21083 }
21084 }
21085
21086 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
21087 {
21088
21089 /* We're trying to match:
21090 prev (movk) == (set (zero_extract (reg r0)
21091 (const_int 16)
21092 (const_int 32))
21093 (const_int imm16_1))
21094 curr (movk) == (set (zero_extract (reg r0)
21095 (const_int 16)
21096 (const_int 48))
21097 (const_int imm16_2)) */
21098
21099 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
21100 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
21101 && REG_P (XEXP (SET_DEST (prev_set), 0))
21102 && REG_P (XEXP (SET_DEST (curr_set), 0))
21103 && REGNO (XEXP (SET_DEST (prev_set), 0))
21104 == REGNO (XEXP (SET_DEST (curr_set), 0))
21105 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
21106 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
21107 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
21108 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
21109 && CONST_INT_P (SET_SRC (prev_set))
21110 && CONST_INT_P (SET_SRC (curr_set)))
21111 return true;
21112
21113 }
21114 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
21115 {
21116 /* We're trying to match:
21117 prev (adrp) == (set (reg r0)
21118 (high (symbol_ref ("SYM"))))
21119 curr (ldr) == (set (reg r1)
21120 (mem (lo_sum (reg r0)
21121 (symbol_ref ("SYM")))))
21122 or
21123 curr (ldr) == (set (reg r1)
21124 (zero_extend (mem
21125 (lo_sum (reg r0)
21126 (symbol_ref ("SYM")))))) */
21127 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21128 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21129 {
21130 rtx curr_src = SET_SRC (curr_set);
21131
21132 if (GET_CODE (curr_src) == ZERO_EXTEND)
21133 curr_src = XEXP (curr_src, 0);
21134
21135 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
21136 && REG_P (XEXP (XEXP (curr_src, 0), 0))
21137 && REGNO (XEXP (XEXP (curr_src, 0), 0))
21138 == REGNO (SET_DEST (prev_set))
21139 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
21140 XEXP (SET_SRC (prev_set), 0)))
21141 return true;
21142 }
21143 }
21144
21145 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
21146 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
21147 && prev_set && curr_set && any_condjump_p (curr)
21148 && GET_CODE (SET_SRC (prev_set)) == COMPARE
21149 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
21150 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
21151 return true;
21152
21153 /* Fuse flag-setting ALU instructions and conditional branch. */
21154 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
21155 && any_condjump_p (curr))
21156 {
21157 unsigned int condreg1, condreg2;
21158 rtx cc_reg_1;
21159 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
21160 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
21161
21162 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
21163 && prev
21164 && modified_in_p (cc_reg_1, prev))
21165 {
21166 enum attr_type prev_type = get_attr_type (prev);
21167
21168 /* FIXME: this misses some instructions which are considered simple
21169 arithmetic for ThunderX. Simple shifts are missed here. */
21170 if (prev_type == TYPE_ALUS_SREG
21171 || prev_type == TYPE_ALUS_IMM
21172 || prev_type == TYPE_LOGICS_REG
21173 || prev_type == TYPE_LOGICS_IMM)
21174 return true;
21175 }
21176 }
21177
21178 /* Fuse ALU instructions and CBZ/CBNZ. */
21179 if (prev_set
21180 && curr_set
21181 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
21182 && any_condjump_p (curr))
21183 {
21184 /* We're trying to match:
21185 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
21186 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
21187 (const_int 0))
21188 (label_ref ("SYM"))
21189 (pc)) */
21190 if (SET_DEST (curr_set) == (pc_rtx)
21191 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
21192 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
21193 && REG_P (SET_DEST (prev_set))
21194 && REGNO (SET_DEST (prev_set))
21195 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
21196 {
21197 /* Fuse ALU operations followed by conditional branch instruction. */
21198 switch (get_attr_type (prev))
21199 {
21200 case TYPE_ALU_IMM:
21201 case TYPE_ALU_SREG:
21202 case TYPE_ADC_REG:
21203 case TYPE_ADC_IMM:
21204 case TYPE_ADCS_REG:
21205 case TYPE_ADCS_IMM:
21206 case TYPE_LOGIC_REG:
21207 case TYPE_LOGIC_IMM:
21208 case TYPE_CSEL:
21209 case TYPE_ADR:
21210 case TYPE_MOV_IMM:
21211 case TYPE_SHIFT_REG:
21212 case TYPE_SHIFT_IMM:
21213 case TYPE_BFM:
21214 case TYPE_RBIT:
21215 case TYPE_REV:
21216 case TYPE_EXTEND:
21217 return true;
21218
21219 default:;
21220 }
21221 }
21222 }
21223
21224 return false;
21225 }
21226
21227 /* Return true iff the instruction fusion described by OP is enabled. */
21228
21229 bool
21230 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
21231 {
21232 return (aarch64_tune_params.fusible_ops & op) != 0;
21233 }
21234
21235 /* If MEM's address is in the form of [base+offset], extract the two
21236 parts into BASE and OFFSET and return true; otherwise clear BASE and
21237 OFFSET and return false. */
21238
21239 bool
21240 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
21241 {
21242 rtx addr;
21243
21244 gcc_assert (MEM_P (mem));
21245
21246 addr = XEXP (mem, 0);
21247
21248 if (REG_P (addr))
21249 {
21250 *base = addr;
21251 *offset = const0_rtx;
21252 return true;
21253 }
21254
21255 if (GET_CODE (addr) == PLUS
21256 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
21257 {
21258 *base = XEXP (addr, 0);
21259 *offset = XEXP (addr, 1);
21260 return true;
21261 }
21262
21263 *base = NULL_RTX;
21264 *offset = NULL_RTX;
21265
21266 return false;
21267 }
21268
21269 /* Types for scheduling fusion. */
21270 enum sched_fusion_type
21271 {
21272 SCHED_FUSION_NONE = 0,
21273 SCHED_FUSION_LD_SIGN_EXTEND,
21274 SCHED_FUSION_LD_ZERO_EXTEND,
21275 SCHED_FUSION_LD,
21276 SCHED_FUSION_ST,
21277 SCHED_FUSION_NUM
21278 };
21279
21280 /* If INSN is a load or store whose address is in the form of
21281 [base+offset], extract the two parts into BASE and OFFSET. Return
21282 the scheduling fusion type of this INSN. */
21283
21284 static enum sched_fusion_type
21285 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
21286 {
21287 rtx x, dest, src;
21288 enum sched_fusion_type fusion = SCHED_FUSION_LD;
21289
21290 gcc_assert (INSN_P (insn));
21291 x = PATTERN (insn);
21292 if (GET_CODE (x) != SET)
21293 return SCHED_FUSION_NONE;
21294
21295 src = SET_SRC (x);
21296 dest = SET_DEST (x);
21297
21298 machine_mode dest_mode = GET_MODE (dest);
21299
21300 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
21301 return SCHED_FUSION_NONE;
21302
21303 if (GET_CODE (src) == SIGN_EXTEND)
21304 {
21305 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
21306 src = XEXP (src, 0);
21307 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21308 return SCHED_FUSION_NONE;
21309 }
21310 else if (GET_CODE (src) == ZERO_EXTEND)
21311 {
21312 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
21313 src = XEXP (src, 0);
21314 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21315 return SCHED_FUSION_NONE;
21316 }
21317
21318 if (GET_CODE (src) == MEM && REG_P (dest))
21319 extract_base_offset_in_addr (src, base, offset);
21320 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
21321 {
21322 fusion = SCHED_FUSION_ST;
21323 extract_base_offset_in_addr (dest, base, offset);
21324 }
21325 else
21326 return SCHED_FUSION_NONE;
21327
21328 if (*base == NULL_RTX || *offset == NULL_RTX)
21329 fusion = SCHED_FUSION_NONE;
21330
21331 return fusion;
21332 }
21333
21334 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
21335
21336 Currently we only support fusing ldr or str instructions, so FUSION_PRI
21337 and PRI are only calculated for these instructions. For other instructions,
21338 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
21339 types of instruction fusion can be added by returning different priorities.
21340
21341 It's important that irrelevant instructions get the largest FUSION_PRI. */
21342
21343 static void
21344 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
21345 int *fusion_pri, int *pri)
21346 {
21347 int tmp, off_val;
21348 rtx base, offset;
21349 enum sched_fusion_type fusion;
21350
21351 gcc_assert (INSN_P (insn));
21352
21353 tmp = max_pri - 1;
21354 fusion = fusion_load_store (insn, &base, &offset);
21355 if (fusion == SCHED_FUSION_NONE)
21356 {
21357 *pri = tmp;
21358 *fusion_pri = tmp;
21359 return;
21360 }
21361
21362 /* Set FUSION_PRI according to fusion type and base register. */
21363 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
21364
21365 /* Calculate PRI. */
21366 tmp /= 2;
21367
21368 /* INSN with smaller offset goes first. */
21369 off_val = (int)(INTVAL (offset));
21370 if (off_val >= 0)
21371 tmp -= (off_val & 0xfffff);
21372 else
21373 tmp += ((- off_val) & 0xfffff);
21374
21375 *pri = tmp;
21376 return;
21377 }
21378
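/* Worked example: for the two stores

     str  w1, [x2, #4]
     str  w1, [x2, #8]

   both get the same FUSION_PRI (same fusion type, same base register),
   while PRI is larger for the #4 store because its offset is smaller,
   so it is preferred first and the pair stays adjacent for fusion.  */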
21379 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
21380 Adjust priority of sha1h instructions so they are scheduled before
21381 other SHA1 instructions. */
21382
21383 static int
21384 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
21385 {
21386 rtx x = PATTERN (insn);
21387
21388 if (GET_CODE (x) == SET)
21389 {
21390 x = SET_SRC (x);
21391
21392 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
21393 return priority + 10;
21394 }
21395
21396 return priority;
21397 }
21398
21399 /* Given OPERANDS of consecutive load/store, check if we can merge
21400 them into ldp/stp. LOAD is true if they are load instructions.
21401 MODE is the mode of memory operands. */
21402
21403 bool
21404 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
21405 machine_mode mode)
21406 {
21407 HOST_WIDE_INT offval_1, offval_2, msize;
21408 enum reg_class rclass_1, rclass_2;
21409 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
21410
21411 if (load)
21412 {
21413 mem_1 = operands[1];
21414 mem_2 = operands[3];
21415 reg_1 = operands[0];
21416 reg_2 = operands[2];
21417 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
21418 if (REGNO (reg_1) == REGNO (reg_2))
21419 return false;
21420 }
21421 else
21422 {
21423 mem_1 = operands[0];
21424 mem_2 = operands[2];
21425 reg_1 = operands[1];
21426 reg_2 = operands[3];
21427 }
21428
21429 /* The mems cannot be volatile. */
21430 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
21431 return false;
21432
21433 /* If we have SImode and slow unaligned ldp,
21434 check that the alignment is at least 8 bytes. */
21435 if (mode == SImode
21436 && (aarch64_tune_params.extra_tuning_flags
21437 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21438 && !optimize_size
21439 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
21440 return false;
21441
21442 /* Check if the addresses are in the form of [base+offset]. */
21443 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21444 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
21445 return false;
21446 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21447 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
21448 return false;
21449
21450 /* Check if the bases are the same. */
21451 if (!rtx_equal_p (base_1, base_2))
21452 return false;
21453
21454 /* The operands must be of the same size. */
21455 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
21456 GET_MODE_SIZE (GET_MODE (mem_2))));
21457
21458 offval_1 = INTVAL (offset_1);
21459 offval_2 = INTVAL (offset_2);
21460 /* We should only be trying this for fixed-sized modes. There is no
21461 SVE LDP/STP instruction. */
21462 msize = GET_MODE_SIZE (mode).to_constant ();
21463 /* Check if the offsets are consecutive. */
21464 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
21465 return false;
21466
21467 /* Check if the addresses are clobbered by load. */
21468 if (load)
21469 {
21470 if (reg_mentioned_p (reg_1, mem_1))
21471 return false;
21472
21473 /* In increasing order, the last load can clobber the address. */
21474 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
21475 return false;
21476 }
21477
21478 /* One of the memory accesses must be a mempair operand.
21479 If it is not the first one, they need to be swapped by the
21480 peephole. */
21481 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
21482 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
21483 return false;
21484
21485 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
21486 rclass_1 = FP_REGS;
21487 else
21488 rclass_1 = GENERAL_REGS;
21489
21490 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
21491 rclass_2 = FP_REGS;
21492 else
21493 rclass_2 = GENERAL_REGS;
21494
21495 /* Check if the registers are of the same class. */
21496 if (rclass_1 != rclass_2)
21497 return false;
21498
21499 return true;
21500 }
21501
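/* Example of a pair that passes the checks above (SImode loads from the
   same base with consecutive offsets and distinct destinations of the
   same register class):

     ldr  w0, [x2, #4]
     ldr  w1, [x2, #8]

   which the ldp/stp peepholes can then merge into

     ldp  w0, w1, [x2, #4]  */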
21502 /* Given OPERANDS of consecutive load/store that can be merged,
21503 swap them if they are not in ascending order. */
21504 void
21505 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
21506 {
21507 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
21508 HOST_WIDE_INT offval_1, offval_2;
21509
21510 if (load)
21511 {
21512 mem_1 = operands[1];
21513 mem_2 = operands[3];
21514 }
21515 else
21516 {
21517 mem_1 = operands[0];
21518 mem_2 = operands[2];
21519 }
21520
21521 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21522 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21523
21524 offval_1 = INTVAL (offset_1);
21525 offval_2 = INTVAL (offset_2);
21526
21527 if (offval_1 > offval_2)
21528 {
21529 /* Irrespective of whether this is a load or a store,
21530 we do the same swap. */
21531 std::swap (operands[0], operands[2]);
21532 std::swap (operands[1], operands[3]);
21533 }
21534 }
21535
21536 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
21537 comparison between the two. */
21538 int
21539 aarch64_host_wide_int_compare (const void *x, const void *y)
21540 {
21541 return wi::cmps (* ((const HOST_WIDE_INT *) x),
21542 * ((const HOST_WIDE_INT *) y));
21543 }
21544
21545 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
21546 other pointing to a REG rtx containing an offset, compare the offsets
21547 of the two pairs.
21548
21549 Return:
21550
21551 1 iff offset (X) > offset (Y)
21552 0 iff offset (X) == offset (Y)
21553 -1 iff offset (X) < offset (Y) */
21554 int
21555 aarch64_ldrstr_offset_compare (const void *x, const void *y)
21556 {
21557 const rtx * operands_1 = (const rtx *) x;
21558 const rtx * operands_2 = (const rtx *) y;
21559 rtx mem_1, mem_2, base, offset_1, offset_2;
21560
21561 if (MEM_P (operands_1[0]))
21562 mem_1 = operands_1[0];
21563 else
21564 mem_1 = operands_1[1];
21565
21566 if (MEM_P (operands_2[0]))
21567 mem_2 = operands_2[0];
21568 else
21569 mem_2 = operands_2[1];
21570
21571 /* Extract the offsets. */
21572 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21573 extract_base_offset_in_addr (mem_2, &base, &offset_2);
21574
21575 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
21576
21577 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
21578 }
21579
21580 /* Given OPERANDS of consecutive load/store, check if we can merge
21581 them into ldp/stp by adjusting the offset. LOAD is true if they
21582 are load instructions. MODE is the mode of memory operands.
21583
21584 Given below consecutive stores:
21585
21586 str w1, [xb, 0x100]
21587 str w1, [xb, 0x104]
21588 str w1, [xb, 0x108]
21589 str w1, [xb, 0x10c]
21590
21591 Though the offsets are out of the range supported by stp, we can
21592 still pair them after adjusting the offset, like:
21593
21594 add scratch, xb, 0x100
21595 stp w1, w1, [scratch]
21596 stp w1, w1, [scratch, 0x8]
21597
21598 The peephole patterns detecting this opportunity should guarantee
21599 the scratch register is available. */
21600
21601 bool
21602 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
21603 scalar_mode mode)
21604 {
21605 const int num_insns = 4;
21606 enum reg_class rclass;
21607 HOST_WIDE_INT offvals[num_insns], msize;
21608 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
21609
21610 if (load)
21611 {
21612 for (int i = 0; i < num_insns; i++)
21613 {
21614 reg[i] = operands[2 * i];
21615 mem[i] = operands[2 * i + 1];
21616
21617 gcc_assert (REG_P (reg[i]));
21618 }
21619
21620 /* Do not attempt to merge the loads if the loads clobber each other. */
21621 for (int i = 0; i < 8; i += 2)
21622 for (int j = i + 2; j < 8; j += 2)
21623 if (reg_overlap_mentioned_p (operands[i], operands[j]))
21624 return false;
21625 }
21626 else
21627 for (int i = 0; i < num_insns; i++)
21628 {
21629 mem[i] = operands[2 * i];
21630 reg[i] = operands[2 * i + 1];
21631 }
21632
21633 /* Skip if memory operand is by itself valid for ldp/stp. */
21634 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21635 return false;
21636
21637 for (int i = 0; i < num_insns; i++)
21638 {
21639 /* The mems cannot be volatile. */
21640 if (MEM_VOLATILE_P (mem[i]))
21641 return false;
21642
21643 /* Check if the addresses are in the form of [base+offset]. */
21644 extract_base_offset_in_addr (mem[i], base + i, offset + i);
21645 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21646 return false;
21647 }
21648
21649 /* Check if the registers are of the same class. */
21650 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21651 ? FP_REGS : GENERAL_REGS;
21652
21653 for (int i = 1; i < num_insns; i++)
21654 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21655 {
21656 if (rclass != FP_REGS)
21657 return false;
21658 }
21659 else
21660 {
21661 if (rclass != GENERAL_REGS)
21662 return false;
21663 }
21664
21665 /* Only the last register in the order in which they occur
21666 may be clobbered by the load. */
21667 if (rclass == GENERAL_REGS && load)
21668 for (int i = 0; i < num_insns - 1; i++)
21669 if (reg_mentioned_p (reg[i], mem[i]))
21670 return false;
21671
21672 /* Check if the bases are the same. */
21673 for (int i = 0; i < num_insns - 1; i++)
21674 if (!rtx_equal_p (base[i], base[i + 1]))
21675 return false;
21676
21677 for (int i = 0; i < num_insns; i++)
21678 offvals[i] = INTVAL (offset[i]);
21679
21680 msize = GET_MODE_SIZE (mode);
21681
21682 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21683 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21684 aarch64_host_wide_int_compare);
21685
21686 if (!(offvals[1] == offvals[0] + msize
21687 && offvals[3] == offvals[2] + msize))
21688 return false;
21689
21690 /* Check that offsets are within range of each other. The ldp/stp
21691 instructions have 7-bit immediate offsets, so use 0x80. */
21692 if (offvals[2] - offvals[0] >= msize * 0x80)
21693 return false;
21694
21695 /* The offsets must be aligned with respect to each other. */
21696 if (offvals[0] % msize != offvals[2] % msize)
21697 return false;
21698
21699 /* If we have SImode and slow unaligned ldp,
21700 check that the alignment is at least 8 bytes. */
21701 if (mode == SImode
21702 && (aarch64_tune_params.extra_tuning_flags
21703 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21704 && !optimize_size
21705 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21706 return false;
21707
21708 return true;
21709 }
21710
21711 /* Given OPERANDS of consecutive load/store, this function pairs them
21712 into LDP/STP after adjusting the offset. It depends on the fact
21713 that the operands can be sorted so the offsets are correct for STP.
21714 MODE is the mode of memory operands. CODE is the rtl operator
21715 which should be applied to all memory operands, it's SIGN_EXTEND,
21716 ZERO_EXTEND or UNKNOWN. */
21717
21718 bool
21719 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21720 scalar_mode mode, RTX_CODE code)
21721 {
21722 rtx base, offset_1, offset_3, t1, t2;
21723 rtx mem_1, mem_2, mem_3, mem_4;
21724 rtx temp_operands[8];
21725 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21726 stp_off_upper_limit, stp_off_lower_limit, msize;
21727
21728 /* We make changes on a copy as we may still bail out. */
21729 for (int i = 0; i < 8; i ++)
21730 temp_operands[i] = operands[i];
21731
21732 /* Sort the operands. */
21733 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21734
21735 /* Copy the memory operands so that if we have to bail for some
21736 reason the original addresses are unchanged. */
21737 if (load)
21738 {
21739 mem_1 = copy_rtx (temp_operands[1]);
21740 mem_2 = copy_rtx (temp_operands[3]);
21741 mem_3 = copy_rtx (temp_operands[5]);
21742 mem_4 = copy_rtx (temp_operands[7]);
21743 }
21744 else
21745 {
21746 mem_1 = copy_rtx (temp_operands[0]);
21747 mem_2 = copy_rtx (temp_operands[2]);
21748 mem_3 = copy_rtx (temp_operands[4]);
21749 mem_4 = copy_rtx (temp_operands[6]);
21750 gcc_assert (code == UNKNOWN);
21751 }
21752
21753 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21754 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21755 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21756 && offset_3 != NULL_RTX);
21757
21758 /* Adjust offset so it can fit in LDP/STP instruction. */
21759 msize = GET_MODE_SIZE (mode);
21760 stp_off_upper_limit = msize * (0x40 - 1);
21761 stp_off_lower_limit = - msize * 0x40;
21762
21763 off_val_1 = INTVAL (offset_1);
21764 off_val_3 = INTVAL (offset_3);
21765
21766 /* The base offset is optimally halfway between the two STP/LDP offsets. */
21767 if (msize <= 4)
21768 base_off = (off_val_1 + off_val_3) / 2;
21769 else
21770 /* However, due to issues with negative LDP/STP offset generation for
21771 larger modes (DF, DI and vector modes), we must not use negative
21772 addresses smaller than 9 signed unadjusted bits can store. This
21773 provides the most range in this case. */
21774 base_off = off_val_1;
21775
21776 /* Adjust the base so that it is aligned with the addresses but still
21777 optimal. */
21778 if (base_off % msize != off_val_1 % msize)
21779 /* Fix the offset, bearing in mind we want to make it bigger not
21780 smaller. */
21781 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21782 else if (msize <= 4)
21783 /* The negative range of LDP/STP is one larger than the positive range. */
21784 base_off += msize;
21785
21786 /* Check if base offset is too big or too small. We can attempt to resolve
21787 this issue by setting it to the maximum value and seeing if the offsets
21788 still fit. */
21789 if (base_off >= 0x1000)
21790 {
21791 base_off = 0x1000 - 1;
21792 /* We must still make sure that the base offset is aligned with respect
21793 to the address. But it may not be made any bigger. */
21794 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21795 }
21796
21797 /* Likewise for the case where the base is too small. */
21798 if (base_off <= -0x1000)
21799 {
21800 base_off = -0x1000 + 1;
21801 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21802 }
21803
21804 /* Offset of the first STP/LDP. */
21805 new_off_1 = off_val_1 - base_off;
21806
21807 /* Offset of the second STP/LDP. */
21808 new_off_3 = off_val_3 - base_off;
21809
21810 /* The offsets must be within the range of the LDP/STP instructions. */
21811 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21812 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21813 return false;
21814
21815 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21816 new_off_1), true);
21817 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21818 new_off_1 + msize), true);
21819 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21820 new_off_3), true);
21821 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21822 new_off_3 + msize), true);
21823
21824 if (!aarch64_mem_pair_operand (mem_1, mode)
21825 || !aarch64_mem_pair_operand (mem_3, mode))
21826 return false;
21827
21828 if (code == ZERO_EXTEND)
21829 {
21830 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21831 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21832 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21833 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21834 }
21835 else if (code == SIGN_EXTEND)
21836 {
21837 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21838 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21839 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21840 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21841 }
21842
21843 if (load)
21844 {
21845 operands[0] = temp_operands[0];
21846 operands[1] = mem_1;
21847 operands[2] = temp_operands[2];
21848 operands[3] = mem_2;
21849 operands[4] = temp_operands[4];
21850 operands[5] = mem_3;
21851 operands[6] = temp_operands[6];
21852 operands[7] = mem_4;
21853 }
21854 else
21855 {
21856 operands[0] = mem_1;
21857 operands[1] = temp_operands[1];
21858 operands[2] = mem_2;
21859 operands[3] = temp_operands[3];
21860 operands[4] = mem_3;
21861 operands[5] = temp_operands[5];
21862 operands[6] = mem_4;
21863 operands[7] = temp_operands[7];
21864 }
21865
21866 /* Emit adjusting instruction. */
21867 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21868 /* Emit ldp/stp instructions. */
21869 t1 = gen_rtx_SET (operands[0], operands[1]);
21870 t2 = gen_rtx_SET (operands[2], operands[3]);
21871 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21872 t1 = gen_rtx_SET (operands[4], operands[5]);
21873 t2 = gen_rtx_SET (operands[6], operands[7]);
21874 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21875 return true;
21876 }
21877
21878 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21879 it isn't worth branching around empty masked ops (including masked
21880 stores). */
21881
21882 static bool
21883 aarch64_empty_mask_is_expensive (unsigned)
21884 {
21885 return false;
21886 }
21887
21888 /* Return 1 if pseudo register should be created and used to hold
21889 GOT address for PIC code. */
21890
21891 bool
21892 aarch64_use_pseudo_pic_reg (void)
21893 {
21894 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21895 }
21896
21897 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21898
21899 static int
21900 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21901 {
21902 switch (XINT (x, 1))
21903 {
21904 case UNSPEC_GOTSMALLPIC:
21905 case UNSPEC_GOTSMALLPIC28K:
21906 case UNSPEC_GOTTINYPIC:
21907 return 0;
21908 default:
21909 break;
21910 }
21911
21912 return default_unspec_may_trap_p (x, flags);
21913 }
21914
21915
21916 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21917 return the log2 of that value. Otherwise return -1. */
21918
21919 int
21920 aarch64_fpconst_pow_of_2 (rtx x)
21921 {
21922 const REAL_VALUE_TYPE *r;
21923
21924 if (!CONST_DOUBLE_P (x))
21925 return -1;
21926
21927 r = CONST_DOUBLE_REAL_VALUE (x);
21928
21929 if (REAL_VALUE_NEGATIVE (*r)
21930 || REAL_VALUE_ISNAN (*r)
21931 || REAL_VALUE_ISINF (*r)
21932 || !real_isinteger (r, DFmode))
21933 return -1;
21934
21935 return exact_log2 (real_to_integer (r));
21936 }
21937
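/* For instance, X == 8.0 yields 3 and X == 1.0 yields 0, while X == 3.0,
   X == 0.5 or any negative or non-CONST_DOUBLE value yields -1.  */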
21938 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of
21939 a power of 2 (i.e. 1/2^n for some positive integer n), return n.
21940 Otherwise return -1. */
21941
21942 int
21943 aarch64_fpconst_pow2_recip (rtx x)
21944 {
21945 REAL_VALUE_TYPE r0;
21946
21947 if (!CONST_DOUBLE_P (x))
21948 return -1;
21949
21950 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21951 if (exact_real_inverse (DFmode, &r0)
21952 && !REAL_VALUE_NEGATIVE (r0))
21953 {
21954 int ret = exact_log2 (real_to_integer (&r0));
21955 if (ret >= 1 && ret <= 32)
21956 return ret;
21957 }
21958 return -1;
21959 }
21960
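/* For instance, X == 0.25 (1/2^2) yields 2 and X == 0.5 yields 1,
   whereas X == 1.0 yields -1 because the result must be at least 1.  */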
21961 /* If X is a vector of equal CONST_DOUBLE values and that value is
21962 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21963
21964 int
21965 aarch64_vec_fpconst_pow_of_2 (rtx x)
21966 {
21967 int nelts;
21968 if (GET_CODE (x) != CONST_VECTOR
21969 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21970 return -1;
21971
21972 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21973 return -1;
21974
21975 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21976 if (firstval <= 0)
21977 return -1;
21978
21979 for (int i = 1; i < nelts; i++)
21980 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21981 return -1;
21982
21983 return firstval;
21984 }
21985
21986 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21987 to float.
21988
21989 __fp16 always promotes through this hook.
21990 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21991 through the generic excess precision logic rather than here. */
21992
21993 static tree
21994 aarch64_promoted_type (const_tree t)
21995 {
21996 if (SCALAR_FLOAT_TYPE_P (t)
21997 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21998 return float_type_node;
21999
22000 return NULL_TREE;
22001 }
22002
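/* Example of the effect on C code (a sketch):

     __fp16 a, b;
     __fp16 c = a + b;   // computed in float, then narrowed back to __fp16

   because __fp16 always promotes through this hook, whereas _Float16
   goes through the excess-precision machinery instead.  */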
22003 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22004
22005 static bool
22006 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22007 optimization_type opt_type)
22008 {
22009 switch (op)
22010 {
22011 case rsqrt_optab:
22012 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22013
22014 default:
22015 return true;
22016 }
22017 }
22018
22019 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22020
22021 static unsigned int
22022 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22023 int *offset)
22024 {
22025 /* Polynomial invariant 1 == (VG / 2) - 1. */
22026 gcc_assert (i == 1);
22027 *factor = 2;
22028 *offset = 1;
22029 return AARCH64_DWARF_VG;
22030 }
22031
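/* Worked example: on a 256-bit SVE implementation the DWARF VG register
   holds 4 (the vector length in 64-bit granules), so a debugger recovers
   the indeterminate as VG / factor - offset = 4 / 2 - 1 = 1, i.e. one
   128-bit chunk beyond the minimum vector length.  */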
22032 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22033 if MODE is HFmode, and punt to the generic implementation otherwise. */
22034
22035 static bool
22036 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22037 {
22038 return (mode == HFmode
22039 ? true
22040 : default_libgcc_floating_mode_supported_p (mode));
22041 }
22042
22043 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22044 if MODE is HFmode, and punt to the generic implementation otherwise. */
22045
22046 static bool
22047 aarch64_scalar_mode_supported_p (scalar_mode mode)
22048 {
22049 return (mode == HFmode
22050 ? true
22051 : default_scalar_mode_supported_p (mode));
22052 }
22053
22054 /* Set the value of FLT_EVAL_METHOD.
22055 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
22056
22057 0: evaluate all operations and constants, whose semantic type has at
22058 most the range and precision of type float, to the range and
22059 precision of float; evaluate all other operations and constants to
22060 the range and precision of the semantic type;
22061
22062 N, where _FloatN is a supported interchange floating type
22063 evaluate all operations and constants, whose semantic type has at
22064 most the range and precision of _FloatN type, to the range and
22065 precision of the _FloatN type; evaluate all other operations and
22066 constants to the range and precision of the semantic type;
22067
22068 If we have the ARMv8.2-A extensions then we support _Float16 in native
22069 precision, so we should set this to 16. Otherwise, we support the type,
22070 but want to evaluate expressions in float precision, so set this to
22071 0. */
22072
22073 static enum flt_eval_method
22074 aarch64_excess_precision (enum excess_precision_type type)
22075 {
22076 switch (type)
22077 {
22078 case EXCESS_PRECISION_TYPE_FAST:
22079 case EXCESS_PRECISION_TYPE_STANDARD:
22080 /* We can calculate either in 16-bit range and precision or
22081 32-bit range and precision. Make that decision based on whether
22082 we have native support for the ARMv8.2-A 16-bit floating-point
22083 instructions or not. */
22084 return (TARGET_FP_F16INST
22085 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
22086 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
22087 case EXCESS_PRECISION_TYPE_IMPLICIT:
22088 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
22089 default:
22090 gcc_unreachable ();
22091 }
22092 return FLT_EVAL_METHOD_UNPREDICTABLE;
22093 }
22094
22095 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
22096 scheduled for speculative execution. Reject the long-running division
22097 and square-root instructions. */
22098
22099 static bool
22100 aarch64_sched_can_speculate_insn (rtx_insn *insn)
22101 {
22102 switch (get_attr_type (insn))
22103 {
22104 case TYPE_SDIV:
22105 case TYPE_UDIV:
22106 case TYPE_FDIVS:
22107 case TYPE_FDIVD:
22108 case TYPE_FSQRTS:
22109 case TYPE_FSQRTD:
22110 case TYPE_NEON_FP_SQRT_S:
22111 case TYPE_NEON_FP_SQRT_D:
22112 case TYPE_NEON_FP_SQRT_S_Q:
22113 case TYPE_NEON_FP_SQRT_D_Q:
22114 case TYPE_NEON_FP_DIV_S:
22115 case TYPE_NEON_FP_DIV_D:
22116 case TYPE_NEON_FP_DIV_S_Q:
22117 case TYPE_NEON_FP_DIV_D_Q:
22118 return false;
22119 default:
22120 return true;
22121 }
22122 }
22123
22124 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
22125
22126 static int
22127 aarch64_compute_pressure_classes (reg_class *classes)
22128 {
22129 int i = 0;
22130 classes[i++] = GENERAL_REGS;
22131 classes[i++] = FP_REGS;
22132 /* PR_REGS isn't a useful pressure class because many predicate pseudo
22133 registers need to go in PR_LO_REGS at some point during their
22134 lifetime. Splitting it into two halves has the effect of making
22135 all predicates count against PR_LO_REGS, so that we try whenever
22136 possible to restrict the number of live predicates to 8. This
22137 greatly reduces the amount of spilling in certain loops. */
22138 classes[i++] = PR_LO_REGS;
22139 classes[i++] = PR_HI_REGS;
22140 return i;
22141 }
22142
22143 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
22144
22145 static bool
22146 aarch64_can_change_mode_class (machine_mode from,
22147 machine_mode to, reg_class_t)
22148 {
22149 unsigned int from_flags = aarch64_classify_vector_mode (from);
22150 unsigned int to_flags = aarch64_classify_vector_mode (to);
22151
22152 bool from_sve_p = (from_flags & VEC_ANY_SVE);
22153 bool to_sve_p = (to_flags & VEC_ANY_SVE);
22154
22155 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
22156 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
22157
22158 bool from_pred_p = (from_flags & VEC_SVE_PRED);
22159 bool to_pred_p = (to_flags & VEC_SVE_PRED);
22160
22161 /* Don't allow changes between predicate modes and other modes.
22162 Only predicate registers can hold predicate modes and only
22163 non-predicate registers can hold non-predicate modes, so any
22164 attempt to mix them would require a round trip through memory. */
22165 if (from_pred_p != to_pred_p)
22166 return false;
22167
22168 /* Don't allow changes between partial SVE modes and other modes.
22169 The contents of partial SVE modes are distributed evenly across
22170 the register, whereas GCC expects them to be clustered together. */
22171 if (from_partial_sve_p != to_partial_sve_p)
22172 return false;
22173
22174 /* Similarly reject changes between partial SVE modes that have
22175 different patterns of significant and insignificant bits. */
22176 if (from_partial_sve_p
22177 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
22178 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
22179 return false;
22180
22181 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
22182 {
22183 /* Don't allow changes between SVE modes and other modes that might
22184 be bigger than 128 bits. In particular, OImode, CImode and XImode
22185 divide into 128-bit quantities while SVE modes divide into
22186 BITS_PER_SVE_VECTOR quantities. */
22187 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
22188 return false;
22189 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
22190 return false;
22191 }
22192
22193 if (BYTES_BIG_ENDIAN)
22194 {
22195 /* Don't allow changes between SVE data modes and non-SVE modes.
22196 See the comment at the head of aarch64-sve.md for details. */
22197 if (from_sve_p != to_sve_p)
22198 return false;
22199
22200 /* Don't allow changes in element size: lane 0 of the new vector
22201 would not then be lane 0 of the old vector. See the comment
22202 above aarch64_maybe_expand_sve_subreg_move for a more detailed
22203 description.
22204
22205 In the worst case, this forces a register to be spilled in
22206 one mode and reloaded in the other, which handles the
22207 endianness correctly. */
22208 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
22209 return false;
22210 }
22211 return true;
22212 }
22213
22214 /* Implement TARGET_EARLY_REMAT_MODES. */
22215
22216 static void
22217 aarch64_select_early_remat_modes (sbitmap modes)
22218 {
22219 /* SVE values are not normally live across a call, so it should be
22220 worth doing early rematerialization even in VL-specific mode. */
22221 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
22222 if (aarch64_sve_mode_p ((machine_mode) i))
22223 bitmap_set_bit (modes, i);
22224 }
22225
22226 /* Override the default target speculation_safe_value. */
22227 static rtx
22228 aarch64_speculation_safe_value (machine_mode mode,
22229 rtx result, rtx val, rtx failval)
22230 {
22231 /* Maybe we should warn if falling back to hard barriers. They are
22232 likely to be noticeably more expensive than the alternative below. */
22233 if (!aarch64_track_speculation)
22234 return default_speculation_safe_value (mode, result, val, failval);
22235
22236 if (!REG_P (val))
22237 val = copy_to_mode_reg (mode, val);
22238
22239 if (!aarch64_reg_or_zero (failval, mode))
22240 failval = copy_to_mode_reg (mode, failval);
22241
22242 emit_insn (gen_despeculate_copy (mode, result, val, failval));
22243 return result;
22244 }
22245
22246 /* Implement TARGET_ESTIMATED_POLY_VALUE.
22247 Look into the tuning structure for an estimate.
22248 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
22249 Advanced SIMD 128 bits. */
22250
22251 static HOST_WIDE_INT
22252 aarch64_estimated_poly_value (poly_int64 val)
22253 {
22254 enum aarch64_sve_vector_bits_enum width_source
22255 = aarch64_tune_params.sve_width;
22256
22257 /* If we still don't have an estimate, use the default. */
22258 if (width_source == SVE_SCALABLE)
22259 return default_estimated_poly_value (val);
22260
22261 HOST_WIDE_INT over_128 = width_source - 128;
22262 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
22263 }
22264
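/* Worked example, assuming the tuning structure reports SVE_256:
   over_128 = 256 - 128 = 128, so a poly_int64 of 16 + 16x (the byte
   size of an SVE vector) is estimated as 16 + 16 * 128 / 128 = 32,
   matching a 256-bit vector.  */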
22265
22266 /* Return true for types that could be supported as SIMD return or
22267 argument types. */
22268
22269 static bool
22270 supported_simd_type (tree t)
22271 {
22272 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
22273 {
22274 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
22275 return s == 1 || s == 2 || s == 4 || s == 8;
22276 }
22277 return false;
22278 }
22279
22280 /* Return true for types that currently are supported as SIMD return
22281 or argument types. */
22282
22283 static bool
22284 currently_supported_simd_type (tree t, tree b)
22285 {
22286 if (COMPLEX_FLOAT_TYPE_P (t))
22287 return false;
22288
22289 if (TYPE_SIZE (t) != TYPE_SIZE (b))
22290 return false;
22291
22292 return supported_simd_type (t);
22293 }
22294
22295 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
22296
22297 static int
22298 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
22299 struct cgraph_simd_clone *clonei,
22300 tree base_type, int num)
22301 {
22302 tree t, ret_type, arg_type;
22303 unsigned int elt_bits, vec_bits, count;
22304
22305 if (!TARGET_SIMD)
22306 return 0;
22307
22308 if (clonei->simdlen
22309 && (clonei->simdlen < 2
22310 || clonei->simdlen > 1024
22311 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
22312 {
22313 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22314 "unsupported simdlen %d", clonei->simdlen);
22315 return 0;
22316 }
22317
22318 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
22319 if (TREE_CODE (ret_type) != VOID_TYPE
22320 && !currently_supported_simd_type (ret_type, base_type))
22321 {
22322 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
22323 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22324 "GCC does not currently support mixed size types "
22325 "for %<simd%> functions");
22326 else if (supported_simd_type (ret_type))
22327 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22328 "GCC does not currently support return type %qT "
22329 "for %<simd%> functions", ret_type);
22330 else
22331 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22332 "unsupported return type %qT for %<simd%> functions",
22333 ret_type);
22334 return 0;
22335 }
22336
22337 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
22338 {
22339 arg_type = TREE_TYPE (t);
22340
22341 if (!currently_supported_simd_type (arg_type, base_type))
22342 {
22343 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
22344 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22345 "GCC does not currently support mixed size types "
22346 "for %<simd%> functions");
22347 else
22348 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22349 "GCC does not currently support argument type %qT "
22350 "for %<simd%> functions", arg_type);
22351 return 0;
22352 }
22353 }
22354
22355 clonei->vecsize_mangle = 'n';
22356 clonei->mask_mode = VOIDmode;
22357 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
22358 if (clonei->simdlen == 0)
22359 {
22360 count = 2;
22361 vec_bits = (num == 0 ? 64 : 128);
22362 clonei->simdlen = vec_bits / elt_bits;
22363 }
22364 else
22365 {
22366 count = 1;
22367 vec_bits = clonei->simdlen * elt_bits;
22368 if (vec_bits != 64 && vec_bits != 128)
22369 {
22370 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22371 "GCC does not currently support simdlen %d for type %qT",
22372 clonei->simdlen, base_type);
22373 return 0;
22374 }
22375 }
22376 clonei->vecsize_int = vec_bits;
22377 clonei->vecsize_float = vec_bits;
22378 return count;
22379 }
22380
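/* Worked example: for a "declare simd" function operating on 32-bit
   elements with no explicit simdlen, the code above creates two clones,
   one with simdlen 64/32 = 2 (64-bit vectors) and one with
   simdlen 128/32 = 4 (128-bit vectors), both using the 'n' mangling.  */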
22381 /* Implement TARGET_SIMD_CLONE_ADJUST. */
22382
22383 static void
22384 aarch64_simd_clone_adjust (struct cgraph_node *node)
22385 {
22386 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
22387 use the correct ABI. */
22388
22389 tree t = TREE_TYPE (node->decl);
22390 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
22391 TYPE_ATTRIBUTES (t));
22392 }
22393
22394 /* Implement TARGET_SIMD_CLONE_USABLE. */
22395
22396 static int
22397 aarch64_simd_clone_usable (struct cgraph_node *node)
22398 {
22399 switch (node->simdclone->vecsize_mangle)
22400 {
22401 case 'n':
22402 if (!TARGET_SIMD)
22403 return -1;
22404 return 0;
22405 default:
22406 gcc_unreachable ();
22407 }
22408 }
22409
22410 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
22411
22412 static int
22413 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
22414 {
22415 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
22416 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
22417 return 0;
22418 return 1;
22419 }
22420
22421 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
22422
22423 static const char *
22424 aarch64_get_multilib_abi_name (void)
22425 {
22426 if (TARGET_BIG_END)
22427 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
22428 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
22429 }
22430
22431 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
22432 global-variable-based guard, use the default; otherwise
22433 return a null tree. */
22434 static tree
22435 aarch64_stack_protect_guard (void)
22436 {
22437 if (aarch64_stack_protector_guard == SSP_GLOBAL)
22438 return default_stack_protect_guard ();
22439
22440 return NULL_TREE;
22441 }
22442
22443 /* Return the diagnostic message string if conversion from FROMTYPE to
22444 TOTYPE is not allowed, NULL otherwise. */
22445
22446 static const char *
22447 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
22448 {
22449 if (element_mode (fromtype) != element_mode (totype))
22450 {
22451 /* Do not allow conversions to/from BFmode scalar types. */
22452 if (TYPE_MODE (fromtype) == BFmode)
22453 return N_("invalid conversion from type %<bfloat16_t%>");
22454 if (TYPE_MODE (totype) == BFmode)
22455 return N_("invalid conversion to type %<bfloat16_t%>");
22456 }
22457
22458 /* Conversion allowed. */
22459 return NULL;
22460 }
22461
22462 /* Return the diagnostic message string if the unary operation OP is
22463 not permitted on TYPE, NULL otherwise. */
22464
22465 static const char *
22466 aarch64_invalid_unary_op (int op, const_tree type)
22467 {
22468 /* Reject all single-operand operations on BFmode except for &. */
22469 if (element_mode (type) == BFmode && op != ADDR_EXPR)
22470 return N_("operation not permitted on type %<bfloat16_t%>");
22471
22472 /* Operation allowed. */
22473 return NULL;
22474 }
22475
22476 /* Return the diagnostic message string if the binary operation OP is
22477 not permitted on TYPE1 and TYPE2, NULL otherwise. */
22478
22479 static const char *
22480 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
22481 const_tree type2)
22482 {
22483 /* Reject all 2-operand operations on BFmode. */
22484 if (element_mode (type1) == BFmode
22485 || element_mode (type2) == BFmode)
22486 return N_("operation not permitted on type %<bfloat16_t%>");
22487
22488 if (VECTOR_TYPE_P (type1)
22489 && VECTOR_TYPE_P (type2)
22490 && !TYPE_INDIVISIBLE_P (type1)
22491 && !TYPE_INDIVISIBLE_P (type2)
22492 && (aarch64_sve::builtin_type_p (type1)
22493 != aarch64_sve::builtin_type_p (type2)))
22494 return N_("cannot combine GNU and SVE vectors in a binary operation");
22495
22496 /* Operation allowed. */
22497 return NULL;
22498 }
22499
22500 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
22501 section at the end if needed. */
22502 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
22503 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
22504 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
22505 void
22506 aarch64_file_end_indicate_exec_stack ()
22507 {
22508 file_end_indicate_exec_stack ();
22509
22510 unsigned feature_1_and = 0;
22511 if (aarch64_bti_enabled ())
22512 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
22513
22514 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
22515 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
22516
22517 if (feature_1_and)
22518 {
22519 /* Generate .note.gnu.property section. */
22520 switch_to_section (get_section (".note.gnu.property",
22521 SECTION_NOTYPE, NULL));
22522
22523 /* PT_NOTE header: namesz, descsz, type.
22524 namesz = 4 ("GNU\0")
22525 descsz = 16 (Size of the program property array)
22526 [(12 + padding) * Number of array elements]
22527 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
22528 assemble_align (POINTER_SIZE);
22529 assemble_integer (GEN_INT (4), 4, 32, 1);
22530 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
22531 assemble_integer (GEN_INT (5), 4, 32, 1);
22532
22533 /* PT_NOTE name. */
22534 assemble_string ("GNU", 4);
22535
22536 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
22537 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
22538 datasz = 4
22539 data = feature_1_and. */
22540 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
22541 assemble_integer (GEN_INT (4), 4, 32, 1);
22542 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
22543
22544 /* Pad the size of the note to the required alignment. */
22545 assemble_align (POINTER_SIZE);
22546 }
22547 }
22548 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
22549 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
22550 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
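
/* For reference, a rough sketch of the directives the function above emits
   when both BTI and return-address signing are enabled (feature_1_and == 3);
   the exact spelling depends on the assembler dialect and target macros:

	.section  .note.gnu.property
	.align    3
	.word     4               // namesz ("GNU\0")
	.word     16              // descsz (12 rounded up to POINTER_BYTES)
	.word     5               // type   (NT_GNU_PROPERTY_TYPE_0)
	.string   "GNU"
	.word     0xc0000000      // GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word     4               // datasz
	.word     3               // BTI | PAC
	.align    3
*/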
22551
22552 /* Target-specific selftests. */
22553
22554 #if CHECKING_P
22555
22556 namespace selftest {
22557
22558 /* Selftest for the RTL loader.
22559 Verify that the RTL loader copes with a dump from
22560 print_rtx_function. This is essentially just a test that class
22561 function_reader can handle a real dump, but it also verifies
22562 that lookup_reg_by_dump_name correctly handles hard regs.
22563 The presence of hard reg names in the dump means that the test is
22564 target-specific, hence it is in this file. */
22565
22566 static void
22567 aarch64_test_loading_full_dump ()
22568 {
22569 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
22570
22571 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
22572
22573 rtx_insn *insn_1 = get_insn_by_uid (1);
22574 ASSERT_EQ (NOTE, GET_CODE (insn_1));
22575
22576 rtx_insn *insn_15 = get_insn_by_uid (15);
22577 ASSERT_EQ (INSN, GET_CODE (insn_15));
22578 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
22579
22580 /* Verify crtl->return_rtx. */
22581 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
22582 ASSERT_EQ (0, REGNO (crtl->return_rtx));
22583 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
22584 }
22585
22586 /* Run all target-specific selftests. */
22587
22588 static void
22589 aarch64_run_selftests (void)
22590 {
22591 aarch64_test_loading_full_dump ();
22592 }
22593
22594 } // namespace selftest
22595
22596 #endif /* #if CHECKING_P */
22597
22598 #undef TARGET_STACK_PROTECT_GUARD
22599 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
22600
22601 #undef TARGET_ADDRESS_COST
22602 #define TARGET_ADDRESS_COST aarch64_address_cost
22603
22604 /* This hook determines whether unnamed bitfields affect the alignment
22605 of the containing structure. The hook returns true if the structure
22606 should inherit the alignment requirements of an unnamed bitfield's
22607 type. */
22608 #undef TARGET_ALIGN_ANON_BITFIELD
22609 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
22610
22611 #undef TARGET_ASM_ALIGNED_DI_OP
22612 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
22613
22614 #undef TARGET_ASM_ALIGNED_HI_OP
22615 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
22616
22617 #undef TARGET_ASM_ALIGNED_SI_OP
22618 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
22619
22620 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
22621 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
22622 hook_bool_const_tree_hwi_hwi_const_tree_true
22623
22624 #undef TARGET_ASM_FILE_START
22625 #define TARGET_ASM_FILE_START aarch64_start_file
22626
22627 #undef TARGET_ASM_OUTPUT_MI_THUNK
22628 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
22629
22630 #undef TARGET_ASM_SELECT_RTX_SECTION
22631 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
22632
22633 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
22634 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
22635
22636 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
22637 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
22638
22639 #undef TARGET_BUILD_BUILTIN_VA_LIST
22640 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
22641
22642 #undef TARGET_CALLEE_COPIES
22643 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
22644
22645 #undef TARGET_CAN_ELIMINATE
22646 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
22647
22648 #undef TARGET_CAN_INLINE_P
22649 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
22650
22651 #undef TARGET_CANNOT_FORCE_CONST_MEM
22652 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
22653
22654 #undef TARGET_CASE_VALUES_THRESHOLD
22655 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
22656
22657 #undef TARGET_CONDITIONAL_REGISTER_USAGE
22658 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
22659
22660 #undef TARGET_MEMBER_TYPE_FORCES_BLK
22661 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
22662
22663 /* Only the least significant bit is used for initialization guard
22664 variables. */
22665 #undef TARGET_CXX_GUARD_MASK_BIT
22666 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
22667
22668 #undef TARGET_C_MODE_FOR_SUFFIX
22669 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22670
22671 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22672 #undef TARGET_DEFAULT_TARGET_FLAGS
22673 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22674 #endif
22675
22676 #undef TARGET_CLASS_MAX_NREGS
22677 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22678
22679 #undef TARGET_BUILTIN_DECL
22680 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22681
22682 #undef TARGET_BUILTIN_RECIPROCAL
22683 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22684
22685 #undef TARGET_C_EXCESS_PRECISION
22686 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22687
22688 #undef TARGET_EXPAND_BUILTIN
22689 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22690
22691 #undef TARGET_EXPAND_BUILTIN_VA_START
22692 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22693
22694 #undef TARGET_FOLD_BUILTIN
22695 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22696
22697 #undef TARGET_FUNCTION_ARG
22698 #define TARGET_FUNCTION_ARG aarch64_function_arg
22699
22700 #undef TARGET_FUNCTION_ARG_ADVANCE
22701 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22702
22703 #undef TARGET_FUNCTION_ARG_BOUNDARY
22704 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22705
22706 #undef TARGET_FUNCTION_ARG_PADDING
22707 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22708
22709 #undef TARGET_GET_RAW_RESULT_MODE
22710 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22711 #undef TARGET_GET_RAW_ARG_MODE
22712 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22713
22714 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22715 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22716
22717 #undef TARGET_FUNCTION_VALUE
22718 #define TARGET_FUNCTION_VALUE aarch64_function_value
22719
22720 #undef TARGET_FUNCTION_VALUE_REGNO_P
22721 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22722
22723 #undef TARGET_GIMPLE_FOLD_BUILTIN
22724 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22725
22726 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22727 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22728
22729 #undef TARGET_INIT_BUILTINS
22730 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22731
22732 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22733 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22734 aarch64_ira_change_pseudo_allocno_class
22735
22736 #undef TARGET_LEGITIMATE_ADDRESS_P
22737 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22738
22739 #undef TARGET_LEGITIMATE_CONSTANT_P
22740 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22741
22742 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22743 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22744 aarch64_legitimize_address_displacement
22745
22746 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22747 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22748
22749 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22750 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22751 aarch64_libgcc_floating_mode_supported_p
22752
22753 #undef TARGET_MANGLE_TYPE
22754 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22755
22756 #undef TARGET_INVALID_CONVERSION
22757 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22758
22759 #undef TARGET_INVALID_UNARY_OP
22760 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22761
22762 #undef TARGET_INVALID_BINARY_OP
22763 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22764
22765 #undef TARGET_VERIFY_TYPE_CONTEXT
22766 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22767
22768 #undef TARGET_MEMORY_MOVE_COST
22769 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22770
22771 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22772 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22773
22774 #undef TARGET_MUST_PASS_IN_STACK
22775 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22776
22777 /* This target hook should return true if accesses to volatile bitfields
22778 should use the narrowest mode possible. It should return false if these
22779 accesses should use the bitfield container type. */
22780 #undef TARGET_NARROW_VOLATILE_BITFIELD
22781 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22782
22783 #undef TARGET_OPTION_OVERRIDE
22784 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22785
22786 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22787 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22788 aarch64_override_options_after_change
22789
22790 #undef TARGET_OPTION_SAVE
22791 #define TARGET_OPTION_SAVE aarch64_option_save
22792
22793 #undef TARGET_OPTION_RESTORE
22794 #define TARGET_OPTION_RESTORE aarch64_option_restore
22795
22796 #undef TARGET_OPTION_PRINT
22797 #define TARGET_OPTION_PRINT aarch64_option_print
22798
22799 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22800 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22801
22802 #undef TARGET_SET_CURRENT_FUNCTION
22803 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22804
22805 #undef TARGET_PASS_BY_REFERENCE
22806 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22807
22808 #undef TARGET_PREFERRED_RELOAD_CLASS
22809 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22810
22811 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22812 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22813
22814 #undef TARGET_PROMOTED_TYPE
22815 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22816
22817 #undef TARGET_SECONDARY_RELOAD
22818 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22819
22820 #undef TARGET_SHIFT_TRUNCATION_MASK
22821 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22822
22823 #undef TARGET_SETUP_INCOMING_VARARGS
22824 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22825
22826 #undef TARGET_STRUCT_VALUE_RTX
22827 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22828
22829 #undef TARGET_REGISTER_MOVE_COST
22830 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22831
22832 #undef TARGET_RETURN_IN_MEMORY
22833 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22834
22835 #undef TARGET_RETURN_IN_MSB
22836 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22837
22838 #undef TARGET_RTX_COSTS
22839 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22840
22841 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22842 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22843
22844 #undef TARGET_SCHED_ISSUE_RATE
22845 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22846
22847 #undef TARGET_SCHED_VARIABLE_ISSUE
22848 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22849
22850 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22851 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22852 aarch64_sched_first_cycle_multipass_dfa_lookahead
22853
22854 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22855 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22856 aarch64_first_cycle_multipass_dfa_lookahead_guard
22857
22858 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22859 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22860 aarch64_get_separate_components
22861
22862 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22863 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22864 aarch64_components_for_bb
22865
22866 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22867 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22868 aarch64_disqualify_components
22869
22870 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22871 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22872 aarch64_emit_prologue_components
22873
22874 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22875 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22876 aarch64_emit_epilogue_components
22877
22878 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22879 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22880 aarch64_set_handled_components
22881
22882 #undef TARGET_TRAMPOLINE_INIT
22883 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22884
22885 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22886 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22887
22888 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22889 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22890
22891 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22892 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22893
22894 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22895 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22896 aarch64_builtin_support_vector_misalignment
22897
22898 #undef TARGET_ARRAY_MODE
22899 #define TARGET_ARRAY_MODE aarch64_array_mode
22900
22901 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22902 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22903
22904 #undef TARGET_VECTORIZE_ADD_STMT_COST
22905 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22906
22907 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22908 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22909 aarch64_builtin_vectorization_cost
22910
22911 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22912 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22913
22914 #undef TARGET_VECTORIZE_BUILTINS
22915 #define TARGET_VECTORIZE_BUILTINS
22916
22917 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22918 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22919 aarch64_builtin_vectorized_function
22920
22921 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22922 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22923 aarch64_autovectorize_vector_modes
22924
22925 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22926 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22927 aarch64_atomic_assign_expand_fenv
22928
22929 /* Section anchor support. */
22930
22931 #undef TARGET_MIN_ANCHOR_OFFSET
22932 #define TARGET_MIN_ANCHOR_OFFSET -256
22933
22934 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22935 byte offset; we can do much more for larger data types, but have no way
22936 to determine the size of the access. We assume accesses are aligned. */
22937 #undef TARGET_MAX_ANCHOR_OFFSET
22938 #define TARGET_MAX_ANCHOR_OFFSET 4095
22939
22940 #undef TARGET_VECTOR_ALIGNMENT
22941 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22942
22943 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22944 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22945 aarch64_vectorize_preferred_vector_alignment
22946 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22947 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22948 aarch64_simd_vector_alignment_reachable
22949
22950 /* vec_perm support. */
22951
22952 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22953 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22954 aarch64_vectorize_vec_perm_const
22955
22956 #undef TARGET_VECTORIZE_RELATED_MODE
22957 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22958 #undef TARGET_VECTORIZE_GET_MASK_MODE
22959 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22960 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22961 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22962 aarch64_empty_mask_is_expensive
22963 #undef TARGET_PREFERRED_ELSE_VALUE
22964 #define TARGET_PREFERRED_ELSE_VALUE \
22965 aarch64_preferred_else_value
22966
22967 #undef TARGET_INIT_LIBFUNCS
22968 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22969
22970 #undef TARGET_FIXED_CONDITION_CODE_REGS
22971 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22972
22973 #undef TARGET_FLAGS_REGNUM
22974 #define TARGET_FLAGS_REGNUM CC_REGNUM
22975
22976 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22977 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22978
22979 #undef TARGET_ASAN_SHADOW_OFFSET
22980 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22981
22982 #undef TARGET_LEGITIMIZE_ADDRESS
22983 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22984
22985 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22986 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22987
22988 #undef TARGET_CAN_USE_DOLOOP_P
22989 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22990
22991 #undef TARGET_SCHED_ADJUST_PRIORITY
22992 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22993
22994 #undef TARGET_SCHED_MACRO_FUSION_P
22995 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22996
22997 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22998 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22999
23000 #undef TARGET_SCHED_FUSION_PRIORITY
23001 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
23002
23003 #undef TARGET_UNSPEC_MAY_TRAP_P
23004 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
23005
23006 #undef TARGET_USE_PSEUDO_PIC_REG
23007 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
23008
23009 #undef TARGET_PRINT_OPERAND
23010 #define TARGET_PRINT_OPERAND aarch64_print_operand
23011
23012 #undef TARGET_PRINT_OPERAND_ADDRESS
23013 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
23014
23015 #undef TARGET_OPTAB_SUPPORTED_P
23016 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
23017
23018 #undef TARGET_OMIT_STRUCT_RETURN_REG
23019 #define TARGET_OMIT_STRUCT_RETURN_REG true
23020
23021 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
23022 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
23023 aarch64_dwarf_poly_indeterminate_value
23024
23025 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
23026 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
23027 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
23028
23029 #undef TARGET_HARD_REGNO_NREGS
23030 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
23031 #undef TARGET_HARD_REGNO_MODE_OK
23032 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
23033
23034 #undef TARGET_MODES_TIEABLE_P
23035 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
23036
23037 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
23038 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
23039 aarch64_hard_regno_call_part_clobbered
23040
23041 #undef TARGET_INSN_CALLEE_ABI
23042 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
23043
23044 #undef TARGET_CONSTANT_ALIGNMENT
23045 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
23046
23047 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
23048 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
23049 aarch64_stack_clash_protection_alloca_probe_range
23050
23051 #undef TARGET_COMPUTE_PRESSURE_CLASSES
23052 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
23053
23054 #undef TARGET_CAN_CHANGE_MODE_CLASS
23055 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
23056
23057 #undef TARGET_SELECT_EARLY_REMAT_MODES
23058 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
23059
23060 #undef TARGET_SPECULATION_SAFE_VALUE
23061 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
23062
23063 #undef TARGET_ESTIMATED_POLY_VALUE
23064 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
23065
23066 #undef TARGET_ATTRIBUTE_TABLE
23067 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
23068
23069 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
23070 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
23071 aarch64_simd_clone_compute_vecsize_and_simdlen
23072
23073 #undef TARGET_SIMD_CLONE_ADJUST
23074 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
23075
23076 #undef TARGET_SIMD_CLONE_USABLE
23077 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
23078
23079 #undef TARGET_COMP_TYPE_ATTRIBUTES
23080 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
23081
23082 #undef TARGET_GET_MULTILIB_ABI_NAME
23083 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
23084
23085 #undef TARGET_FNTYPE_ABI
23086 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
23087
23088 #if CHECKING_P
23089 #undef TARGET_RUN_TARGET_SELFTESTS
23090 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
23091 #endif /* #if CHECKING_P */
23092
23093 #undef TARGET_ASM_POST_CFI_STARTPROC
23094 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
23095
23096 #undef TARGET_STRICT_ARGUMENT_NAMING
23097 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
23098
23099 #undef TARGET_MD_ASM_ADJUST
23100 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
23101
23102 struct gcc_target targetm = TARGET_INITIALIZER;
23103
23104 #include "gt-aarch64.h"