1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
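/* For instance (purely as an illustration of the fields above), a vector
   whose 16-bit integer elements all equal 0x5500 could be described as
   simd_immediate_info (HImode, 0x55, MOV, LSL, 8): move the value 0x55
   into every element, shifted left by 8 bits.  */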
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "4", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1215
1216 static tree
1217 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1218 int, bool *no_add_attrs)
1219 {
1220 /* Since we set fn_type_req to true, the caller should have checked
1221 this for us. */
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1223 switch ((arm_pcs) fntype_abi (*node).id ())
1224 {
1225 case ARM_PCS_AAPCS64:
1226 case ARM_PCS_SIMD:
1227 return NULL_TREE;
1228
1229 case ARM_PCS_SVE:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1231 name);
1232 *no_add_attrs = true;
1233 return NULL_TREE;
1234
1235 case ARM_PCS_TLSDESC:
1236 case ARM_PCS_UNKNOWN:
1237 break;
1238 }
1239 gcc_unreachable ();
1240 }
1241
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table[] =
1244 {
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute, NULL },
1249 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1250 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1251 };
1252
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1254
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1257 {
1258 const char *const name;
1259 const unsigned long flags_on;
1260 const unsigned long flags_off;
1261 };
1262
1263 typedef enum aarch64_cond_code
1264 {
1265 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1266 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1267 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1268 }
1269 aarch64_cc;
1270
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
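/* The enum above lists each condition next to its inverse (EQ/NE, CS/CC,
   MI/PL, VS/VC, HI/LS, GE/LT, GT/LE, AL/NV), so flipping bit 0 of the code
   yields the inverse condition; e.g. AARCH64_INVERSE_CONDITION_CODE
   (AARCH64_GE) is AARCH64_LT.  */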
1272
1273 struct aarch64_branch_protect_type
1274 {
1275 /* The type's name that the user passes to the branch-protection option
1276 string. */
1277 const char* name;
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1281 Return values:
1282 * AARCH64_PARSE_OK: Handling was successful.
1283 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1284 caller should print an error.
1285 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1286 prints its own error. */
1287 enum aarch64_parse_opt_result (*handler)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type* subtypes;
1290 unsigned int num_subtypes;
1291 };
1292
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str, char* rest)
1295 {
1296 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1297 aarch64_enable_bti = 0;
1298 if (rest)
1299 {
1300 error ("unexpected %<%s%> after %<%s%>", rest, str);
1301 return AARCH64_PARSE_INVALID_FEATURE;
1302 }
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str, char* rest)
1308 {
1309 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1310 aarch64_ra_sign_key = AARCH64_KEY_A;
1311 aarch64_enable_bti = 1;
1312 if (rest)
1313 {
1314 error ("unexpected %<%s%> after %<%s%>", rest, str);
1315 return AARCH64_PARSE_INVALID_FEATURE;
1316 }
1317 return AARCH64_PARSE_OK;
1318 }
1319
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1322 char* rest ATTRIBUTE_UNUSED)
1323 {
1324 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1325 aarch64_ra_sign_key = AARCH64_KEY_A;
1326 return AARCH64_PARSE_OK;
1327 }
1328
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1331 char* rest ATTRIBUTE_UNUSED)
1332 {
1333 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1334 return AARCH64_PARSE_OK;
1335 }
1336
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1339 char* rest ATTRIBUTE_UNUSED)
1340 {
1341 aarch64_ra_sign_key = AARCH64_KEY_B;
1342 return AARCH64_PARSE_OK;
1343 }
1344
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1347 char* rest ATTRIBUTE_UNUSED)
1348 {
1349 aarch64_enable_bti = 1;
1350 return AARCH64_PARSE_OK;
1351 }
1352
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1356 { NULL, NULL, NULL, 0 }
1357 };
1358
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1360 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1364 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1365 { NULL, NULL, NULL, 0 }
1366 };
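/* As an illustration of how these tables are used: a string such as
   "pac-ret+leaf+b-key" would be matched against the top-level table above,
   invoking aarch64_handle_pac_ret_protection for "pac-ret" and then the
   "leaf" and "b-key" subtype handlers, leaving aarch64_ra_sign_scope set to
   AARCH64_FUNCTION_ALL and aarch64_ra_sign_key set to AARCH64_KEY_B.  */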
1367
1368 /* The condition codes of the processor, and the inverse function. */
1369 static const char * const aarch64_condition_codes[] =
1370 {
1371 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1372 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1373 };
1374
1375 /* The preferred condition codes for SVE conditions. */
1376 static const char *const aarch64_sve_condition_codes[] =
1377 {
1378 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1379 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1380 };
1381
1382 /* Return the assembly token for svpattern value VALUE. */
1383
1384 static const char *
1385 svpattern_token (enum aarch64_svpattern pattern)
1386 {
1387 switch (pattern)
1388 {
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE)
1391 #undef CASE
1392 case AARCH64_NUM_SVPATTERNS:
1393 break;
1394 }
1395 gcc_unreachable ();
1396 }
1397
1398 /* Return the descriptor of the SIMD ABI. */
1399
1400 static const predefined_function_abi &
1401 aarch64_simd_abi (void)
1402 {
1403 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1404 if (!simd_abi.initialized_p ())
1405 {
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi.full_reg_clobbers ();
1408 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1411 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1412 }
1413 return simd_abi;
1414 }
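/* In other words, functions with the aarch64_vector_pcs attribute preserve
   the full 128-bit q8-q23 registers, whereas the base PCS only guarantees
   the low 64 bits of v8-v15.  */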
1415
1416 /* Return the descriptor of the SVE PCS. */
1417
1418 static const predefined_function_abi &
1419 aarch64_sve_abi (void)
1420 {
1421 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1422 if (!sve_abi.initialized_p ())
1423 {
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi.full_reg_clobbers ();
1426 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1428 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1430 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1431 }
1432 return sve_abi;
1433 }
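/* In other words, in addition to the base-PCS rules, SVE PCS functions
   preserve the full z8-z23 vector registers and the predicate registers
   p4-p15.  */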
1434
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 const char *
1437 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1438 const char * branch_format)
1439 {
1440 rtx_code_label * tmp_label = gen_label_rtx ();
1441 char label_buf[256];
1442 char buffer[128];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1444 CODE_LABEL_NUMBER (tmp_label));
1445 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1446 rtx dest_label = operands[pos_label];
1447 operands[pos_label] = tmp_label;
1448
1449 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1450 output_asm_insn (buffer, operands);
1451
1452 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1453 operands[pos_label] = dest_label;
1454 output_asm_insn (buffer, operands);
1455 return "";
1456 }
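/* The caller supplies a short-range branch (normally with the inverted
   condition) in BRANCH_FORMAT, so the emitted sequence has the shape:

	<branch>	.Ltmp
	b	<far destination>
   .Ltmp:

   i.e. the conditional branch only has to skip over the unconditional
   branch, which can reach the distant label.  */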
1457
1458 void
1459 aarch64_err_no_fpadvsimd (machine_mode mode)
1460 {
1461 if (TARGET_GENERAL_REGS_ONLY)
1462 if (FLOAT_MODE_P (mode))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1465 else
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1468 else
1469 if (FLOAT_MODE_P (mode))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1472 else
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1475 }
1476
1477 /* Report when we try to do something that requires SVE when SVE is disabled.
1478 This is an error of last resort and isn't very high-quality. It usually
1479 involves attempts to measure the vector length in some way. */
1480 static void
1481 aarch64_report_sve_required (void)
1482 {
1483 static bool reported_p = false;
1484
1485 /* Avoid reporting a slew of messages for a single oversight. */
1486 if (reported_p)
1487 return;
1488
1489 error ("this operation requires the SVE ISA extension");
1490 inform (input_location, "you can enable SVE using the command-line"
1491 " option %<-march%>, or by using the %<target%>"
1492 " attribute or pragma");
1493 reported_p = true;
1494 }
1495
1496 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1497 registers. */
1498 inline bool
1499 pr_or_ffr_regnum_p (unsigned int regno)
1500 {
1501 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1502 }
1503
1504 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1505 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1506 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1507 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1508 and GENERAL_REGS is lower than the memory cost (in this case the best class
1509 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1510 cost results in bad allocations with many redundant int<->FP moves which
1511 are expensive on various cores.
1512 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1513 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1514 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1515 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1516 The result of this is that it is no longer inefficient to have a higher
1517 memory move cost than the register move cost.
1518 */
1519
1520 static reg_class_t
1521 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1522 reg_class_t best_class)
1523 {
1524 machine_mode mode;
1525
1526 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1527 || !reg_class_subset_p (FP_REGS, allocno_class))
1528 return allocno_class;
1529
1530 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1531 || !reg_class_subset_p (FP_REGS, best_class))
1532 return best_class;
1533
1534 mode = PSEUDO_REGNO_MODE (regno);
1535 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1536 }
1537
1538 static unsigned int
1539 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1540 {
1541 if (GET_MODE_UNIT_SIZE (mode) == 4)
1542 return aarch64_tune_params.min_div_recip_mul_sf;
1543 return aarch64_tune_params.min_div_recip_mul_df;
1544 }
1545
1546 /* Return the reassociation width of treeop OPC with mode MODE. */
1547 static int
1548 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1549 {
1550 if (VECTOR_MODE_P (mode))
1551 return aarch64_tune_params.vec_reassoc_width;
1552 if (INTEGRAL_MODE_P (mode))
1553 return aarch64_tune_params.int_reassoc_width;
1554 /* Avoid reassociating floating point addition so we emit more FMAs. */
1555 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1556 return aarch64_tune_params.fp_reassoc_width;
1557 return 1;
1558 }
1559
1560 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1561 unsigned
1562 aarch64_dbx_register_number (unsigned regno)
1563 {
1564 if (GP_REGNUM_P (regno))
1565 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1566 else if (regno == SP_REGNUM)
1567 return AARCH64_DWARF_SP;
1568 else if (FP_REGNUM_P (regno))
1569 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1570 else if (PR_REGNUM_P (regno))
1571 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1572 else if (regno == VG_REGNUM)
1573 return AARCH64_DWARF_VG;
1574
1575 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1576 equivalent DWARF register. */
1577 return DWARF_FRAME_REGISTERS;
1578 }
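/* For example, hard register x7 maps to AARCH64_DWARF_R0 + 7, v13 maps to
   AARCH64_DWARF_V0 + 13, and p4 maps to AARCH64_DWARF_P0 + 4; anything not
   handled above is reported as having no DWARF equivalent.  */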
1579
1580 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1581 integer, otherwise return X unmodified. */
1582 static rtx
1583 aarch64_bit_representation (rtx x)
1584 {
1585 if (CONST_DOUBLE_P (x))
1586 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1587 return x;
1588 }
1589
1590 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1591 static bool
1592 aarch64_advsimd_struct_mode_p (machine_mode mode)
1593 {
1594 return (TARGET_SIMD
1595 && (mode == OImode || mode == CImode || mode == XImode));
1596 }
1597
1598 /* Return true if MODE is an SVE predicate mode. */
1599 static bool
1600 aarch64_sve_pred_mode_p (machine_mode mode)
1601 {
1602 return (TARGET_SVE
1603 && (mode == VNx16BImode
1604 || mode == VNx8BImode
1605 || mode == VNx4BImode
1606 || mode == VNx2BImode));
1607 }
1608
1609 /* Three mutually-exclusive flags describing a vector or predicate type. */
1610 const unsigned int VEC_ADVSIMD = 1;
1611 const unsigned int VEC_SVE_DATA = 2;
1612 const unsigned int VEC_SVE_PRED = 4;
1613 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1614 a structure of 2, 3 or 4 vectors. */
1615 const unsigned int VEC_STRUCT = 8;
1616 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1617 vector has fewer significant bytes than a full SVE vector. */
1618 const unsigned int VEC_PARTIAL = 16;
1619 /* Useful combinations of the above. */
1620 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1621 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1622
1623 /* Return a set of flags describing the vector properties of mode MODE.
1624 Ignore modes that are not supported by the current target. */
1625 static unsigned int
1626 aarch64_classify_vector_mode (machine_mode mode)
1627 {
1628 if (aarch64_advsimd_struct_mode_p (mode))
1629 return VEC_ADVSIMD | VEC_STRUCT;
1630
1631 if (aarch64_sve_pred_mode_p (mode))
1632 return VEC_SVE_PRED;
1633
1634 /* Make the decision based on the mode's enum value rather than its
1635 properties, so that we keep the correct classification regardless
1636 of -msve-vector-bits. */
1637 switch (mode)
1638 {
1639 /* Partial SVE QI vectors. */
1640 case E_VNx2QImode:
1641 case E_VNx4QImode:
1642 case E_VNx8QImode:
1643 /* Partial SVE HI vectors. */
1644 case E_VNx2HImode:
1645 case E_VNx4HImode:
1646 /* Partial SVE SI vector. */
1647 case E_VNx2SImode:
1648 /* Partial SVE HF vectors. */
1649 case E_VNx2HFmode:
1650 case E_VNx4HFmode:
1651 /* Partial SVE SF vector. */
1652 case E_VNx2SFmode:
1653 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1654
1655 case E_VNx16QImode:
1656 case E_VNx8HImode:
1657 case E_VNx4SImode:
1658 case E_VNx2DImode:
1659 case E_VNx8BFmode:
1660 case E_VNx8HFmode:
1661 case E_VNx4SFmode:
1662 case E_VNx2DFmode:
1663 return TARGET_SVE ? VEC_SVE_DATA : 0;
1664
1665 /* x2 SVE vectors. */
1666 case E_VNx32QImode:
1667 case E_VNx16HImode:
1668 case E_VNx8SImode:
1669 case E_VNx4DImode:
1670 case E_VNx16BFmode:
1671 case E_VNx16HFmode:
1672 case E_VNx8SFmode:
1673 case E_VNx4DFmode:
1674 /* x3 SVE vectors. */
1675 case E_VNx48QImode:
1676 case E_VNx24HImode:
1677 case E_VNx12SImode:
1678 case E_VNx6DImode:
1679 case E_VNx24BFmode:
1680 case E_VNx24HFmode:
1681 case E_VNx12SFmode:
1682 case E_VNx6DFmode:
1683 /* x4 SVE vectors. */
1684 case E_VNx64QImode:
1685 case E_VNx32HImode:
1686 case E_VNx16SImode:
1687 case E_VNx8DImode:
1688 case E_VNx32BFmode:
1689 case E_VNx32HFmode:
1690 case E_VNx16SFmode:
1691 case E_VNx8DFmode:
1692 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1693
1694 /* 64-bit Advanced SIMD vectors. */
1695 case E_V8QImode:
1696 case E_V4HImode:
1697 case E_V2SImode:
1698 /* ...E_V1DImode doesn't exist. */
1699 case E_V4HFmode:
1700 case E_V4BFmode:
1701 case E_V2SFmode:
1702 case E_V1DFmode:
1703 /* 128-bit Advanced SIMD vectors. */
1704 case E_V16QImode:
1705 case E_V8HImode:
1706 case E_V4SImode:
1707 case E_V2DImode:
1708 case E_V8HFmode:
1709 case E_V8BFmode:
1710 case E_V4SFmode:
1711 case E_V2DFmode:
1712 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1713
1714 default:
1715 return 0;
1716 }
1717 }
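/* Some example classifications, assuming the corresponding target features
   are enabled: V16QImode -> VEC_ADVSIMD, OImode -> VEC_ADVSIMD | VEC_STRUCT,
   VNx4SImode -> VEC_SVE_DATA, VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL,
   VNx32QImode -> VEC_SVE_DATA | VEC_STRUCT and VNx16BImode -> VEC_SVE_PRED.  */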
1718
1719 /* Return true if MODE is any of the data vector modes, including
1720 structure modes. */
1721 static bool
1722 aarch64_vector_data_mode_p (machine_mode mode)
1723 {
1724 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1725 }
1726
1727 /* Return true if MODE is any form of SVE mode, including predicates,
1728 vectors and structures. */
1729 bool
1730 aarch64_sve_mode_p (machine_mode mode)
1731 {
1732 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1733 }
1734
1735 /* Return true if MODE is an SVE data vector mode; either a single vector
1736 or a structure of vectors. */
1737 static bool
1738 aarch64_sve_data_mode_p (machine_mode mode)
1739 {
1740 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1741 }
1742
1743 /* Return the number of defined bytes in one constituent vector of
1744 SVE mode MODE, which has vector flags VEC_FLAGS. */
1745 static poly_int64
1746 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1747 {
1748 if (vec_flags & VEC_PARTIAL)
1749 /* A single partial vector. */
1750 return GET_MODE_SIZE (mode);
1751
1752 if (vec_flags & VEC_SVE_DATA)
1753 /* A single vector or a tuple. */
1754 return BYTES_PER_SVE_VECTOR;
1755
1756 /* A single predicate. */
1757 gcc_assert (vec_flags & VEC_SVE_PRED);
1758 return BYTES_PER_SVE_PRED;
1759 }
1760
1761 /* Implement target hook TARGET_ARRAY_MODE. */
1762 static opt_machine_mode
1763 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1764 {
1765 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1766 && IN_RANGE (nelems, 2, 4))
1767 return mode_for_vector (GET_MODE_INNER (mode),
1768 GET_MODE_NUNITS (mode) * nelems);
1769
1770 return opt_machine_mode ();
1771 }
1772
1773 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1774 static bool
1775 aarch64_array_mode_supported_p (machine_mode mode,
1776 unsigned HOST_WIDE_INT nelems)
1777 {
1778 if (TARGET_SIMD
1779 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1780 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1781 && (nelems >= 2 && nelems <= 4))
1782 return true;
1783
1784 return false;
1785 }
1786
1787 /* MODE is some form of SVE vector mode. For data modes, return the number
1788 of vector register bits that each element of MODE occupies, such as 64
1789 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1790 in a 64-bit container). For predicate modes, return the number of
1791 data bits controlled by each significant predicate bit. */
1792
1793 static unsigned int
1794 aarch64_sve_container_bits (machine_mode mode)
1795 {
1796 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1797 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1798 ? BITS_PER_SVE_VECTOR
1799 : GET_MODE_BITSIZE (mode));
1800 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1801 }
1802
1803 /* Return the SVE predicate mode to use for elements that have
1804 ELEM_NBYTES bytes, if such a mode exists. */
1805
1806 opt_machine_mode
1807 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1808 {
1809 if (TARGET_SVE)
1810 {
1811 if (elem_nbytes == 1)
1812 return VNx16BImode;
1813 if (elem_nbytes == 2)
1814 return VNx8BImode;
1815 if (elem_nbytes == 4)
1816 return VNx4BImode;
1817 if (elem_nbytes == 8)
1818 return VNx2BImode;
1819 }
1820 return opt_machine_mode ();
1821 }
1822
1823 /* Return the SVE predicate mode that should be used to control
1824 SVE mode MODE. */
1825
1826 machine_mode
1827 aarch64_sve_pred_mode (machine_mode mode)
1828 {
1829 unsigned int bits = aarch64_sve_container_bits (mode);
1830 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1831 }
1832
1833 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1834
1835 static opt_machine_mode
1836 aarch64_get_mask_mode (machine_mode mode)
1837 {
1838 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1839 if (vec_flags & VEC_SVE_DATA)
1840 return aarch64_sve_pred_mode (mode);
1841
1842 return default_get_mask_mode (mode);
1843 }
1844
1845 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1846
1847 opt_machine_mode
1848 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1849 {
1850 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1851 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1852 machine_mode mode;
1853 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1854 if (inner_mode == GET_MODE_INNER (mode)
1855 && known_eq (nunits, GET_MODE_NUNITS (mode))
1856 && aarch64_sve_data_mode_p (mode))
1857 return mode;
1858 return opt_machine_mode ();
1859 }
1860
1861 /* Return the integer element mode associated with SVE mode MODE. */
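/* For example, both VNx4SImode and the predicate mode VNx4BImode map
   to SImode.  */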
1862
1863 static scalar_int_mode
1864 aarch64_sve_element_int_mode (machine_mode mode)
1865 {
1866 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1867 ? BITS_PER_SVE_VECTOR
1868 : GET_MODE_BITSIZE (mode));
1869 unsigned int elt_bits = vector_element_size (vector_bits,
1870 GET_MODE_NUNITS (mode));
1871 return int_mode_for_size (elt_bits, 0).require ();
1872 }
1873
1874 /* Return an integer element mode that contains exactly
1875 aarch64_sve_container_bits (MODE) bits. This is wider than
1876 aarch64_sve_element_int_mode if MODE is a partial vector,
1877 otherwise it's the same. */
1878
1879 static scalar_int_mode
1880 aarch64_sve_container_int_mode (machine_mode mode)
1881 {
1882 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1883 }
1884
1885 /* Return the integer vector mode associated with SVE mode MODE.
1886 Unlike related_int_vector_mode, this can handle the case in which
1887 MODE is a predicate (and thus has a different total size). */
1888
1889 machine_mode
1890 aarch64_sve_int_mode (machine_mode mode)
1891 {
1892 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1893 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1894 }
1895
1896 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1897
1898 static opt_machine_mode
1899 aarch64_vectorize_related_mode (machine_mode vector_mode,
1900 scalar_mode element_mode,
1901 poly_uint64 nunits)
1902 {
1903 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1904
1905 /* If we're operating on SVE vectors, try to return an SVE mode. */
1906 poly_uint64 sve_nunits;
1907 if ((vec_flags & VEC_SVE_DATA)
1908 && multiple_p (BYTES_PER_SVE_VECTOR,
1909 GET_MODE_SIZE (element_mode), &sve_nunits))
1910 {
1911 machine_mode sve_mode;
1912 if (maybe_ne (nunits, 0U))
1913 {
1914 /* Try to find a full or partial SVE mode with exactly
1915 NUNITS units. */
1916 if (multiple_p (sve_nunits, nunits)
1917 && aarch64_sve_data_mode (element_mode,
1918 nunits).exists (&sve_mode))
1919 return sve_mode;
1920 }
1921 else
1922 {
1923 /* Take the preferred number of units from the number of bytes
1924 that fit in VECTOR_MODE. We always start by "autodetecting"
1925 a full vector mode with preferred_simd_mode, so vectors
1926 chosen here will also be full vector modes. Then
1927 autovectorize_vector_modes tries smaller starting modes
1928 and thus smaller preferred numbers of units. */
1929 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1930 if (aarch64_sve_data_mode (element_mode,
1931 sve_nunits).exists (&sve_mode))
1932 return sve_mode;
1933 }
1934 }
1935
1936 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1937 if ((vec_flags & VEC_ADVSIMD)
1938 && known_eq (nunits, 0U)
1939 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1940 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1941 * GET_MODE_NUNITS (vector_mode), 128U))
1942 {
1943 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1944 if (VECTOR_MODE_P (res))
1945 return res;
1946 }
1947
1948 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1949 }
1950
1951 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1952 prefer to use the first arithmetic operand as the else value if
1953 the else value doesn't matter, since that exactly matches the SVE
1954 destructive merging form. For ternary operations we could either
1955 pick the first operand and use FMAD-like instructions or the last
1956 operand and use FMLA-like instructions; the latter seems more
1957 natural. */
1958
1959 static tree
1960 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1961 {
1962 return nops == 3 ? ops[2] : ops[0];
1963 }
1964
1965 /* Implement TARGET_HARD_REGNO_NREGS. */
1966
1967 static unsigned int
1968 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1969 {
1970 /* ??? Logically we should only need to provide a value when
1971 HARD_REGNO_MODE_OK says that the combination is valid,
1972 but at the moment we need to handle all modes. Just ignore
1973 any runtime parts for registers that can't store them. */
1974 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1975 switch (aarch64_regno_regclass (regno))
1976 {
1977 case FP_REGS:
1978 case FP_LO_REGS:
1979 case FP_LO8_REGS:
1980 {
1981 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1982 if (vec_flags & VEC_SVE_DATA)
1983 return exact_div (GET_MODE_SIZE (mode),
1984 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1985 return CEIL (lowest_size, UNITS_PER_VREG);
1986 }
1987 case PR_REGS:
1988 case PR_LO_REGS:
1989 case PR_HI_REGS:
1990 case FFR_REGS:
1991 case PR_AND_FFR_REGS:
1992 return 1;
1993 default:
1994 return CEIL (lowest_size, UNITS_PER_WORD);
1995 }
1996 gcc_unreachable ();
1997 }
1998
1999 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2000
2001 static bool
2002 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2003 {
2004 if (GET_MODE_CLASS (mode) == MODE_CC)
2005 return regno == CC_REGNUM;
2006
2007 if (regno == VG_REGNUM)
2008 /* This must have the same size as _Unwind_Word. */
2009 return mode == DImode;
2010
2011 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2012 if (vec_flags & VEC_SVE_PRED)
2013 return pr_or_ffr_regnum_p (regno);
2014
2015 if (pr_or_ffr_regnum_p (regno))
2016 return false;
2017
2018 if (regno == SP_REGNUM)
2019 /* The purpose of comparing with ptr_mode is to support the
2020 global register variable associated with the stack pointer
2021 register via the syntax of asm ("wsp") in ILP32. */
2022 return mode == Pmode || mode == ptr_mode;
2023
2024 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2025 return mode == Pmode;
2026
2027 if (GP_REGNUM_P (regno))
2028 {
2029 if (vec_flags & VEC_ANY_SVE)
2030 return false;
2031 if (known_le (GET_MODE_SIZE (mode), 8))
2032 return true;
2033 if (known_le (GET_MODE_SIZE (mode), 16))
2034 return (regno & 1) == 0;
2035 }
2036 else if (FP_REGNUM_P (regno))
2037 {
2038 if (vec_flags & VEC_STRUCT)
2039 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2040 else
2041 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2042 }
2043
2044 return false;
2045 }
2046
2052 /* Return true if a function with type FNTYPE returns its value in
2053 SVE vector or predicate registers. */
2054
2055 static bool
2056 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2057 {
2058 tree return_type = TREE_TYPE (fntype);
2059 return (return_type != error_mark_node
2060 && aarch64_sve::builtin_type_p (return_type));
2061 }
2062
2063 /* Return true if a function with type FNTYPE takes arguments in
2064 SVE vector or predicate registers. */
2065
2066 static bool
2067 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2068 {
2069 CUMULATIVE_ARGS args_so_far_v;
2070 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2071 NULL_TREE, 0, true);
2072 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2073
2074 for (tree chain = TYPE_ARG_TYPES (fntype);
2075 chain && chain != void_list_node;
2076 chain = TREE_CHAIN (chain))
2077 {
2078 tree arg_type = TREE_VALUE (chain);
2079 if (arg_type == error_mark_node)
2080 return false;
2081
2082 function_arg_info arg (arg_type, /*named=*/true);
2083 apply_pass_by_reference_rules (&args_so_far_v, arg);
2084 if (aarch64_sve::builtin_type_p (arg.type))
2085 return true;
2086
2087 targetm.calls.function_arg_advance (args_so_far, arg);
2088 }
2089 return false;
2090 }
2091
2092 /* Implement TARGET_FNTYPE_ABI. */
2093
2094 static const predefined_function_abi &
2095 aarch64_fntype_abi (const_tree fntype)
2096 {
2097 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2098 return aarch64_simd_abi ();
2099
2100 if (aarch64_returns_value_in_sve_regs_p (fntype)
2101 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2102 return aarch64_sve_abi ();
2103
2104 return default_function_abi;
2105 }
2106
2107 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2108
2109 static bool
2110 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2111 {
2112 return (aarch64_sve::builtin_type_p (type1)
2113 == aarch64_sve::builtin_type_p (type2));
2114 }
2115
2116 /* Return true if we should emit CFI for register REGNO. */
2117
2118 static bool
2119 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2120 {
2121 return (GP_REGNUM_P (regno)
2122 || !default_function_abi.clobbers_full_reg_p (regno));
2123 }
2124
2125 /* Return the mode we should use to save and restore register REGNO. */
2126
2127 static machine_mode
2128 aarch64_reg_save_mode (unsigned int regno)
2129 {
2130 if (GP_REGNUM_P (regno))
2131 return DImode;
2132
2133 if (FP_REGNUM_P (regno))
2134 switch (crtl->abi->id ())
2135 {
2136 case ARM_PCS_AAPCS64:
2137 /* Only the low 64 bits are saved by the base PCS. */
2138 return DFmode;
2139
2140 case ARM_PCS_SIMD:
2141 /* The vector PCS saves the low 128 bits (which is the full
2142 register on non-SVE targets). */
2143 return TFmode;
2144
2145 case ARM_PCS_SVE:
2146 /* Use vectors of DImode for registers that need frame
2147 information, so that the first 64 bits of the save slot
2148 are always the equivalent of what storing D<n> would give. */
2149 if (aarch64_emit_cfi_for_reg_p (regno))
2150 return VNx2DImode;
2151
2152 /* Use vectors of bytes otherwise, so that the layout is
2153 endian-agnostic, and so that we can use LDR and STR for
2154 big-endian targets. */
2155 return VNx16QImode;
2156
2157 case ARM_PCS_TLSDESC:
2158 case ARM_PCS_UNKNOWN:
2159 break;
2160 }
2161
2162 if (PR_REGNUM_P (regno))
2163 /* Save the full predicate register. */
2164 return VNx16BImode;
2165
2166 gcc_unreachable ();
2167 }
2168
2169 /* Implement TARGET_INSN_CALLEE_ABI. */
2170
2171 const predefined_function_abi &
2172 aarch64_insn_callee_abi (const rtx_insn *insn)
2173 {
2174 rtx pat = PATTERN (insn);
2175 gcc_assert (GET_CODE (pat) == PARALLEL);
2176 rtx unspec = XVECEXP (pat, 0, 1);
2177 gcc_assert (GET_CODE (unspec) == UNSPEC
2178 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2179 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2180 }
2181
2182 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2183 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2184 clobbers the top 64 bits when restoring the bottom 64 bits. */
2185
2186 static bool
2187 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2188 unsigned int regno,
2189 machine_mode mode)
2190 {
2191 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2192 {
2193 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2194 unsigned int nregs = hard_regno_nregs (regno, mode);
2195 if (nregs > 1)
2196 per_register_size = exact_div (per_register_size, nregs);
2197 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2198 return maybe_gt (per_register_size, 16);
2199 return maybe_gt (per_register_size, 8);
2200 }
2201 return false;
2202 }
2203
2204 /* Implement REGMODE_NATURAL_SIZE. */
2205 poly_uint64
2206 aarch64_regmode_natural_size (machine_mode mode)
2207 {
2208 /* The natural size for SVE data modes is one SVE data vector,
2209 and similarly for predicates. We can't independently modify
2210 anything smaller than that. */
2211 /* ??? For now, only do this for variable-width SVE registers.
2212 Doing it for constant-sized registers breaks lower-subreg.c. */
2213 /* ??? And once that's fixed, we should probably have similar
2214 code for Advanced SIMD. */
2215 if (!aarch64_sve_vg.is_constant ())
2216 {
2217 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2218 if (vec_flags & VEC_SVE_PRED)
2219 return BYTES_PER_SVE_PRED;
2220 if (vec_flags & VEC_SVE_DATA)
2221 return BYTES_PER_SVE_VECTOR;
2222 }
2223 return UNITS_PER_WORD;
2224 }
2225
2226 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2227 machine_mode
2228 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2229 machine_mode mode)
2230 {
2231 /* The predicate mode determines which bits are significant and
2232 which are "don't care". Decreasing the number of lanes would
2233 lose data while increasing the number of lanes would make bits
2234 unnecessarily significant. */
2235 if (PR_REGNUM_P (regno))
2236 return mode;
2237 if (known_ge (GET_MODE_SIZE (mode), 4))
2238 return mode;
2239 else
2240 return SImode;
2241 }
2242
2243 /* Return true if I's bits are consecutive ones from the MSB. */
2244 bool
2245 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2246 {
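/* If I is a block of ones starting at the MSB (with all lower bits clear),
   then -I is a power of two and exact_log2 returns its nonnegative log;
   for any other value, including zero, exact_log2 returns HOST_WIDE_INT_M1.  */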
2247 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2248 }
2249
2250 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2251 that strcpy from constants will be faster. */
2252
2253 static HOST_WIDE_INT
2254 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2255 {
2256 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2257 return MAX (align, BITS_PER_WORD);
2258 return align;
2259 }
2260
2261 /* Return true if calls to DECL should be treated as
2262 long-calls (ie called via a register). */
2263 static bool
2264 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2265 {
2266 return false;
2267 }
2268
2269 /* Return true if calls to symbol-ref SYM should be treated as
2270 long-calls (ie called via a register). */
2271 bool
2272 aarch64_is_long_call_p (rtx sym)
2273 {
2274 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2275 }
2276
2277 /* Return true if calls to symbol-ref SYM should not go through
2278 plt stubs. */
2279
2280 bool
2281 aarch64_is_noplt_call_p (rtx sym)
2282 {
2283 const_tree decl = SYMBOL_REF_DECL (sym);
2284
2285 if (flag_pic
2286 && decl
2287 && (!flag_plt
2288 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2289 && !targetm.binds_local_p (decl))
2290 return true;
2291
2292 return false;
2293 }
2294
2295 /* Return true if the offsets to a zero/sign-extract operation
2296 represent an expression that matches an extend operation. The
2297 operands represent the parameters from
2298
2299 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
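/* For example, a MULT_IMM of 4 with an EXTRACT_IMM of 18 extracts bits
   [17:0] of a value shifted left by 2, which is equivalent to extending
   a 16-bit value and shifting the result left by 2.  */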
2300 bool
2301 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2302 rtx extract_imm)
2303 {
2304 HOST_WIDE_INT mult_val, extract_val;
2305
2306 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2307 return false;
2308
2309 mult_val = INTVAL (mult_imm);
2310 extract_val = INTVAL (extract_imm);
2311
2312 if (extract_val > 8
2313 && extract_val < GET_MODE_BITSIZE (mode)
2314 && exact_log2 (extract_val & ~7) > 0
2315 && (extract_val & 7) <= 4
2316 && mult_val == (1 << (extract_val & 7)))
2317 return true;
2318
2319 return false;
2320 }
2321
2322 /* Emit an insn that's a simple single-set. Both operands must be
2323 known to be valid. */
2324 inline static rtx_insn *
2325 emit_set_insn (rtx x, rtx y)
2326 {
2327 return emit_insn (gen_rtx_SET (x, y));
2328 }
2329
2330 /* X and Y are two things to compare using CODE. Emit the compare insn and
2331 return the rtx for register 0 in the proper mode. */
2332 rtx
2333 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2334 {
2335 machine_mode cmp_mode = GET_MODE (x);
2336 machine_mode cc_mode;
2337 rtx cc_reg;
2338
2339 if (cmp_mode == TImode)
2340 {
2341 gcc_assert (code == NE);
2342
2343 cc_mode = CCmode;
2344 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2345
2346 rtx x_lo = operand_subword (x, 0, 0, TImode);
2347 rtx y_lo = operand_subword (y, 0, 0, TImode);
2348 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2349
2350 rtx x_hi = operand_subword (x, 1, 0, TImode);
2351 rtx y_hi = operand_subword (y, 1, 0, TImode);
2352 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2353 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2354 GEN_INT (AARCH64_EQ)));
2355 }
2356 else
2357 {
2358 cc_mode = SELECT_CC_MODE (code, x, y);
2359 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2360 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2361 }
2362 return cc_reg;
2363 }
2364
2365 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2366
2367 static rtx
2368 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2369 machine_mode y_mode)
2370 {
2371 if (y_mode == E_QImode || y_mode == E_HImode)
2372 {
2373 if (CONST_INT_P (y))
2374 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2375 else
2376 {
2377 rtx t, cc_reg;
2378 machine_mode cc_mode;
2379
2380 t = gen_rtx_ZERO_EXTEND (SImode, y);
2381 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2382 cc_mode = CC_SWPmode;
2383 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2384 emit_set_insn (cc_reg, t);
2385 return cc_reg;
2386 }
2387 }
2388
2389 if (!aarch64_plus_operand (y, y_mode))
2390 y = force_reg (y_mode, y);
2391
2392 return aarch64_gen_compare_reg (code, x, y);
2393 }
2394
2395 /* Build the SYMBOL_REF for __tls_get_addr. */
2396
2397 static GTY(()) rtx tls_get_addr_libfunc;
2398
2399 rtx
2400 aarch64_tls_get_addr (void)
2401 {
2402 if (!tls_get_addr_libfunc)
2403 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2404 return tls_get_addr_libfunc;
2405 }
2406
2407 /* Return the TLS model to use for ADDR. */
2408
2409 static enum tls_model
2410 tls_symbolic_operand_type (rtx addr)
2411 {
2412 enum tls_model tls_kind = TLS_MODEL_NONE;
2413 if (GET_CODE (addr) == CONST)
2414 {
2415 poly_int64 addend;
2416 rtx sym = strip_offset (addr, &addend);
2417 if (GET_CODE (sym) == SYMBOL_REF)
2418 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2419 }
2420 else if (GET_CODE (addr) == SYMBOL_REF)
2421 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2422
2423 return tls_kind;
2424 }
2425
2426 /* We allow LO_SUMs in our legitimate addresses so that combine
2427 can take care of combining addresses where necessary, but for
2428 generation purposes we generate the address
2429 as:
2430 RTL Absolute
2431 tmp = hi (symbol_ref); adrp x1, foo
2432 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2433 nop
2434
2435 PIC TLS
2436 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2437 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2438 bl __tls_get_addr
2439 nop
2440
2441 Load TLS symbol, depending on TLS mechanism and TLS access model.
2442
2443 Global Dynamic - Traditional TLS:
2444 adrp tmp, :tlsgd:imm
2445 add dest, tmp, #:tlsgd_lo12:imm
2446 bl __tls_get_addr
2447
2448 Global Dynamic - TLS Descriptors:
2449 adrp dest, :tlsdesc:imm
2450 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2451 add dest, dest, #:tlsdesc_lo12:imm
2452 blr tmp
2453 mrs tp, tpidr_el0
2454 add dest, dest, tp
2455
2456 Initial Exec:
2457 mrs tp, tpidr_el0
2458 adrp tmp, :gottprel:imm
2459 ldr dest, [tmp, #:gottprel_lo12:imm]
2460 add dest, dest, tp
2461
2462 Local Exec:
2463 mrs tp, tpidr_el0
2464 add t0, tp, #:tprel_hi12:imm, lsl #12
2465 add t0, t0, #:tprel_lo12_nc:imm
2466 */
2467
2468 static void
2469 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2470 enum aarch64_symbol_type type)
2471 {
2472 switch (type)
2473 {
2474 case SYMBOL_SMALL_ABSOLUTE:
2475 {
2476 /* In ILP32, the mode of dest can be either SImode or DImode. */
2477 rtx tmp_reg = dest;
2478 machine_mode mode = GET_MODE (dest);
2479
2480 gcc_assert (mode == Pmode || mode == ptr_mode);
2481
2482 if (can_create_pseudo_p ())
2483 tmp_reg = gen_reg_rtx (mode);
2484
2485 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2486 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2487 return;
2488 }
2489
2490 case SYMBOL_TINY_ABSOLUTE:
2491 emit_insn (gen_rtx_SET (dest, imm));
2492 return;
2493
2494 case SYMBOL_SMALL_GOT_28K:
2495 {
2496 machine_mode mode = GET_MODE (dest);
2497 rtx gp_rtx = pic_offset_table_rtx;
2498 rtx insn;
2499 rtx mem;
2500
2501 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2502 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2503 estimate rtx costs, in which case pic_offset_table_rtx is not
2504 initialized. In that case there is no need to generate the first
2505 adrp instruction, as the final cost for global variable access is
2506 one instruction. */
2507 if (gp_rtx != NULL)
2508 {
2509 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2510 we use the page base as the GOT base, the first page may be
2511 wasted; in the worst case only 28K of space is left for the GOT).
2512
2513 The generated instruction sequence for accessing a global
2514 variable is:
2515
2516 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2517
2518 Only one instruction is needed, but we must initialize
2519 pic_offset_table_rtx properly. We generate an initialization insn
2520 for every global access and rely on CSE to remove the redundant ones.
2521
2522 The final instruction sequence will look like the following
2523 for multiple global variable accesses.
2524
2525 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2526
2527 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2528 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2529 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2530 ... */
2531
2532 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2533 crtl->uses_pic_offset_table = 1;
2534 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2535
2536 if (mode != GET_MODE (gp_rtx))
2537 gp_rtx = gen_lowpart (mode, gp_rtx);
2538
2539 }
2540
2541 if (mode == ptr_mode)
2542 {
2543 if (mode == DImode)
2544 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2545 else
2546 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2547
2548 mem = XVECEXP (SET_SRC (insn), 0, 0);
2549 }
2550 else
2551 {
2552 gcc_assert (mode == Pmode);
2553
2554 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2555 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2556 }
2557
2558 /* The operand is expected to be a MEM. Whenever the related insn
2559 pattern changes, the code above that calculates MEM should be
2560 updated. */
2561 gcc_assert (GET_CODE (mem) == MEM);
2562 MEM_READONLY_P (mem) = 1;
2563 MEM_NOTRAP_P (mem) = 1;
2564 emit_insn (insn);
2565 return;
2566 }
2567
2568 case SYMBOL_SMALL_GOT_4G:
2569 {
2570 /* In ILP32, the mode of dest can be either SImode or DImode,
2571 while the got entry is always of SImode size. The mode of
2572 dest depends on how dest is used: if dest is assigned to a
2573 pointer (e.g. in the memory), it has SImode; it may have
2574 DImode if dest is dereferenced to access the memory.
2575 This is why we have to handle three different ldr_got_small
2576 patterns here (two patterns for ILP32). */
2577
2578 rtx insn;
2579 rtx mem;
2580 rtx tmp_reg = dest;
2581 machine_mode mode = GET_MODE (dest);
2582
2583 if (can_create_pseudo_p ())
2584 tmp_reg = gen_reg_rtx (mode);
2585
2586 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2587 if (mode == ptr_mode)
2588 {
2589 if (mode == DImode)
2590 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2591 else
2592 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2593
2594 mem = XVECEXP (SET_SRC (insn), 0, 0);
2595 }
2596 else
2597 {
2598 gcc_assert (mode == Pmode);
2599
2600 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2601 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2602 }
2603
2604 gcc_assert (GET_CODE (mem) == MEM);
2605 MEM_READONLY_P (mem) = 1;
2606 MEM_NOTRAP_P (mem) = 1;
2607 emit_insn (insn);
2608 return;
2609 }
2610
2611 case SYMBOL_SMALL_TLSGD:
2612 {
2613 rtx_insn *insns;
2614 /* The return type of __tls_get_addr is the C pointer type
2615 so use ptr_mode. */
2616 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2617 rtx tmp_reg = dest;
2618
2619 if (GET_MODE (dest) != ptr_mode)
2620 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2621
2622 start_sequence ();
2623 if (ptr_mode == SImode)
2624 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2625 else
2626 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2627 insns = get_insns ();
2628 end_sequence ();
2629
2630 RTL_CONST_CALL_P (insns) = 1;
2631 emit_libcall_block (insns, tmp_reg, result, imm);
2632 /* Convert back to the mode of the dest adding a zero_extend
2633 from SImode (ptr_mode) to DImode (Pmode). */
2634 if (dest != tmp_reg)
2635 convert_move (dest, tmp_reg, true);
2636 return;
2637 }
2638
2639 case SYMBOL_SMALL_TLSDESC:
2640 {
2641 machine_mode mode = GET_MODE (dest);
2642 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2643 rtx tp;
2644
2645 gcc_assert (mode == Pmode || mode == ptr_mode);
2646
2647 /* In ILP32, the got entry is always of SImode size. Unlike
2648 small GOT, the dest is fixed at reg 0. */
2649 if (TARGET_ILP32)
2650 emit_insn (gen_tlsdesc_small_si (imm));
2651 else
2652 emit_insn (gen_tlsdesc_small_di (imm));
2653 tp = aarch64_load_tp (NULL);
2654
2655 if (mode != Pmode)
2656 tp = gen_lowpart (mode, tp);
2657
2658 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2659 if (REG_P (dest))
2660 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2661 return;
2662 }
2663
2664 case SYMBOL_SMALL_TLSIE:
2665 {
2666 /* In ILP32, the mode of dest can be either SImode or DImode,
2667 while the got entry is always of SImode size. The mode of
2668 dest depends on how dest is used: if dest is assigned to a
2669 pointer (e.g. in the memory), it has SImode; it may have
2670 DImode if dest is dereferenced to access the memory.
2671 This is why we have to handle three different tlsie_small
2672 patterns here (two patterns for ILP32). */
2673 machine_mode mode = GET_MODE (dest);
2674 rtx tmp_reg = gen_reg_rtx (mode);
2675 rtx tp = aarch64_load_tp (NULL);
2676
2677 if (mode == ptr_mode)
2678 {
2679 if (mode == DImode)
2680 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2681 else
2682 {
2683 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2684 tp = gen_lowpart (mode, tp);
2685 }
2686 }
2687 else
2688 {
2689 gcc_assert (mode == Pmode);
2690 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2691 }
2692
2693 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2694 if (REG_P (dest))
2695 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2696 return;
2697 }
2698
2699 case SYMBOL_TLSLE12:
2700 case SYMBOL_TLSLE24:
2701 case SYMBOL_TLSLE32:
2702 case SYMBOL_TLSLE48:
2703 {
2704 machine_mode mode = GET_MODE (dest);
2705 rtx tp = aarch64_load_tp (NULL);
2706
2707 if (mode != Pmode)
2708 tp = gen_lowpart (mode, tp);
2709
2710 switch (type)
2711 {
2712 case SYMBOL_TLSLE12:
2713 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2714 (dest, tp, imm));
2715 break;
2716 case SYMBOL_TLSLE24:
2717 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2718 (dest, tp, imm));
2719 break;
2720 case SYMBOL_TLSLE32:
2721 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2722 (dest, imm));
2723 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2724 (dest, dest, tp));
2725 break;
2726 case SYMBOL_TLSLE48:
2727 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2728 (dest, imm));
2729 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2730 (dest, dest, tp));
2731 break;
2732 default:
2733 gcc_unreachable ();
2734 }
2735
2736 if (REG_P (dest))
2737 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2738 return;
2739 }
2740
2741 case SYMBOL_TINY_GOT:
2742 emit_insn (gen_ldr_got_tiny (dest, imm));
2743 return;
2744
2745 case SYMBOL_TINY_TLSIE:
2746 {
2747 machine_mode mode = GET_MODE (dest);
2748 rtx tp = aarch64_load_tp (NULL);
2749
2750 if (mode == ptr_mode)
2751 {
2752 if (mode == DImode)
2753 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2754 else
2755 {
2756 tp = gen_lowpart (mode, tp);
2757 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2758 }
2759 }
2760 else
2761 {
2762 gcc_assert (mode == Pmode);
2763 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2764 }
2765
2766 if (REG_P (dest))
2767 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2768 return;
2769 }
2770
2771 default:
2772 gcc_unreachable ();
2773 }
2774 }
2775
2776 /* Emit a move from SRC to DEST. Assume that the move expanders can
2777 handle all moves if !can_create_pseudo_p (). The distinction is
2778 important because, unlike emit_move_insn, the move expanders know
2779 how to force Pmode objects into the constant pool even when the
2780 constant pool address is not itself legitimate. */
2781 static rtx
2782 aarch64_emit_move (rtx dest, rtx src)
2783 {
2784 return (can_create_pseudo_p ()
2785 ? emit_move_insn (dest, src)
2786 : emit_move_insn_1 (dest, src));
2787 }
2788
2789 /* Apply UNOPTAB to OP and store the result in DEST. */
2790
2791 static void
2792 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2793 {
2794 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2795 if (dest != tmp)
2796 emit_move_insn (dest, tmp);
2797 }
2798
2799 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2800
2801 static void
2802 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2803 {
2804 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2805 OPTAB_DIRECT);
2806 if (dest != tmp)
2807 emit_move_insn (dest, tmp);
2808 }
2809
2810 /* Split a 128-bit move operation into two 64-bit move operations,
2811 taking care to handle partial overlap of register to register
2812 copies. Special cases are needed when moving between GP regs and
2813 FP regs. SRC can be a register, constant or memory; DST a register
2814 or memory. If either operand is memory it must not have any side
2815 effects. */
2816 void
2817 aarch64_split_128bit_move (rtx dst, rtx src)
2818 {
2819 rtx dst_lo, dst_hi;
2820 rtx src_lo, src_hi;
2821
2822 machine_mode mode = GET_MODE (dst);
2823
2824 gcc_assert (mode == TImode || mode == TFmode);
2825 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2826 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2827
2828 if (REG_P (dst) && REG_P (src))
2829 {
2830 int src_regno = REGNO (src);
2831 int dst_regno = REGNO (dst);
2832
2833 /* Handle FP <-> GP regs. */
2834 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2835 {
2836 src_lo = gen_lowpart (word_mode, src);
2837 src_hi = gen_highpart (word_mode, src);
2838
2839 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2840 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2841 return;
2842 }
2843 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2844 {
2845 dst_lo = gen_lowpart (word_mode, dst);
2846 dst_hi = gen_highpart (word_mode, dst);
2847
2848 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2849 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2850 return;
2851 }
2852 }
2853
2854 dst_lo = gen_lowpart (word_mode, dst);
2855 dst_hi = gen_highpart (word_mode, dst);
2856 src_lo = gen_lowpart (word_mode, src);
2857 src_hi = gen_highpart_mode (word_mode, mode, src);
2858
2859 /* At most one pairing may overlap. */
2860 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2861 {
2862 aarch64_emit_move (dst_hi, src_hi);
2863 aarch64_emit_move (dst_lo, src_lo);
2864 }
2865 else
2866 {
2867 aarch64_emit_move (dst_lo, src_lo);
2868 aarch64_emit_move (dst_hi, src_hi);
2869 }
2870 }
2871
2872 bool
2873 aarch64_split_128bit_move_p (rtx dst, rtx src)
2874 {
2875 return (! REG_P (src)
2876 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2877 }
2878
2879 /* Split a complex SIMD combine. */
2880
2881 void
2882 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2883 {
2884 machine_mode src_mode = GET_MODE (src1);
2885 machine_mode dst_mode = GET_MODE (dst);
2886
2887 gcc_assert (VECTOR_MODE_P (dst_mode));
2888 gcc_assert (register_operand (dst, dst_mode)
2889 && register_operand (src1, src_mode)
2890 && register_operand (src2, src_mode));
2891
2892 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2893 return;
2894 }
2895
2896 /* Split a complex SIMD move. */
2897
2898 void
2899 aarch64_split_simd_move (rtx dst, rtx src)
2900 {
2901 machine_mode src_mode = GET_MODE (src);
2902 machine_mode dst_mode = GET_MODE (dst);
2903
2904 gcc_assert (VECTOR_MODE_P (dst_mode));
2905
2906 if (REG_P (dst) && REG_P (src))
2907 {
2908 gcc_assert (VECTOR_MODE_P (src_mode));
2909 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2910 }
2911 }
2912
2913 bool
2914 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2915 machine_mode ymode, rtx y)
2916 {
2917 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2918 gcc_assert (r != NULL);
2919 return rtx_equal_p (x, r);
2920 }
2921
2922 /* Return TARGET if it is nonnull and a register of mode MODE.
2923 Otherwise, return a fresh register of mode MODE if we can,
2924 or TARGET reinterpreted as MODE if we can't. */
2925
2926 static rtx
2927 aarch64_target_reg (rtx target, machine_mode mode)
2928 {
2929 if (target && REG_P (target) && GET_MODE (target) == mode)
2930 return target;
2931 if (!can_create_pseudo_p ())
2932 {
2933 gcc_assert (target);
2934 return gen_lowpart (mode, target);
2935 }
2936 return gen_reg_rtx (mode);
2937 }
2938
2939 /* Return a register that contains the constant in BUILDER, given that
2940 the constant is a legitimate move operand. Use TARGET as the register
2941 if it is nonnull and convenient. */
2942
2943 static rtx
2944 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2945 {
2946 rtx src = builder.build ();
2947 target = aarch64_target_reg (target, GET_MODE (src));
2948 emit_insn (gen_rtx_SET (target, src));
2949 return target;
2950 }
2951
2952 static rtx
2953 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2954 {
2955 if (can_create_pseudo_p ())
2956 return force_reg (mode, value);
2957 else
2958 {
2959 gcc_assert (x);
2960 aarch64_emit_move (x, value);
2961 return x;
2962 }
2963 }
2964
2965 /* Return true if predicate value X is a constant in which every element
2966 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2967 value, i.e. as a predicate in which all bits are significant. */
2968
2969 static bool
2970 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2971 {
2972 if (GET_CODE (x) != CONST_VECTOR)
2973 return false;
2974
2975 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2976 GET_MODE_NUNITS (GET_MODE (x)));
2977 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2978 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2979 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2980
2981 unsigned int nelts = const_vector_encoded_nelts (x);
2982 for (unsigned int i = 0; i < nelts; ++i)
2983 {
2984 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2985 if (!CONST_INT_P (elt))
2986 return false;
2987
2988 builder.quick_push (elt);
2989 for (unsigned int j = 1; j < factor; ++j)
2990 builder.quick_push (const0_rtx);
2991 }
2992 builder.finalize ();
2993 return true;
2994 }
2995
2996 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2997 widest predicate element size it can have (that is, the largest size
2998 for which each element would still be 0 or 1). */
2999
3000 unsigned int
3001 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3002 {
3003 /* Start with the most optimistic assumption: that we only need
3004 one bit per pattern. This is what we will use if only the first
3005 bit in each pattern is ever set. */
3006 unsigned int mask = GET_MODE_SIZE (DImode);
3007 mask |= builder.npatterns ();
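/* MASK accumulates 8 (the widest possible element size in bytes), the
   number of patterns and, in the loop below, the index of every set bit;
   the result (mask & -mask) is then the largest power of two that divides
   all of these values.  */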
3008
3009 /* Look for set bits. */
3010 unsigned int nelts = builder.encoded_nelts ();
3011 for (unsigned int i = 1; i < nelts; ++i)
3012 if (INTVAL (builder.elt (i)) != 0)
3013 {
3014 if (i & 1)
3015 return 1;
3016 mask |= i;
3017 }
3018 return mask & -mask;
3019 }
3020
3021 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3022 return that predicate mode, otherwise return opt_machine_mode (). */
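/* For example, the { 1, 0, 0, 0, ... } constant built by
   aarch64_ptrue_all (4) maps to VNx4BImode.  */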
3023
3024 opt_machine_mode
3025 aarch64_ptrue_all_mode (rtx x)
3026 {
3027 gcc_assert (GET_MODE (x) == VNx16BImode);
3028 if (GET_CODE (x) != CONST_VECTOR
3029 || !CONST_VECTOR_DUPLICATE_P (x)
3030 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3031 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3032 return opt_machine_mode ();
3033
3034 unsigned int nelts = const_vector_encoded_nelts (x);
3035 for (unsigned int i = 1; i < nelts; ++i)
3036 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3037 return opt_machine_mode ();
3038
3039 return aarch64_sve_pred_mode (nelts);
3040 }
3041
3042 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3043 that the constant would have with predicate element size ELT_SIZE
3044 (ignoring the upper bits in each element) and return:
3045
3046 * -1 if all bits are set
3047 * N if the predicate has N leading set bits followed by all clear bits
3048 * 0 if the predicate does not have any of these forms. */
3049
3050 int
3051 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3052 unsigned int elt_size)
3053 {
3054 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3055 followed by set bits. */
3056 if (builder.nelts_per_pattern () == 3)
3057 return 0;
3058
3059 /* Skip over leading set bits. */
3060 unsigned int nelts = builder.encoded_nelts ();
3061 unsigned int i = 0;
3062 for (; i < nelts; i += elt_size)
3063 if (INTVAL (builder.elt (i)) == 0)
3064 break;
3065 unsigned int vl = i / elt_size;
3066
3067 /* Check for the all-true case. */
3068 if (i == nelts)
3069 return -1;
3070
3071 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3072 repeating pattern of set bits followed by clear bits. */
3073 if (builder.nelts_per_pattern () != 2)
3074 return 0;
3075
3076 /* We have a "foreground" value and a duplicated "background" value.
3077 If the background might repeat and the last set bit belongs to it,
3078 we might have set bits followed by clear bits followed by set bits. */
3079 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3080 return 0;
3081
3082 /* Make sure that the rest are all clear. */
3083 for (; i < nelts; i += elt_size)
3084 if (INTVAL (builder.elt (i)) != 0)
3085 return 0;
3086
3087 return vl;
3088 }
3089
3090 /* See if there is an svpattern that encodes an SVE predicate of mode
3091 PRED_MODE in which the first VL bits are set and the rest are clear.
3092 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3093 A VL of -1 indicates an all-true vector. */
3094
3095 aarch64_svpattern
3096 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3097 {
3098 if (vl < 0)
3099 return AARCH64_SV_ALL;
3100
3101 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3102 return AARCH64_NUM_SVPATTERNS;
3103
3104 if (vl >= 1 && vl <= 8)
3105 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3106
3107 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3108 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3109
3110 int max_vl;
3111 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3112 {
3113 if (vl == (max_vl / 3) * 3)
3114 return AARCH64_SV_MUL3;
3115 /* These would only trigger for non-power-of-2 lengths. */
3116 if (vl == (max_vl & -4))
3117 return AARCH64_SV_MUL4;
3118 if (vl == (1 << floor_log2 (max_vl)))
3119 return AARCH64_SV_POW2;
3120 if (vl == max_vl)
3121 return AARCH64_SV_ALL;
3122 }
3123 return AARCH64_NUM_SVPATTERNS;
3124 }
3125
3126 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3127 bits has the lowest bit set and the upper bits clear. This is the
3128 VNx16BImode equivalent of a PTRUE for controlling elements of
3129 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3130 all bits are significant, even the upper zeros. */
3131
3132 rtx
3133 aarch64_ptrue_all (unsigned int elt_size)
3134 {
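/* For example, an ELT_SIZE of 2 builds the repeating constant
   { 1, 0, 1, 0, ... }, i.e. the VNx16BI form of a PTRUE for .H elements.  */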
3135 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3136 builder.quick_push (const1_rtx);
3137 for (unsigned int i = 1; i < elt_size; ++i)
3138 builder.quick_push (const0_rtx);
3139 return builder.build ();
3140 }
3141
3142 /* Return an all-true predicate register of mode MODE. */
3143
3144 rtx
3145 aarch64_ptrue_reg (machine_mode mode)
3146 {
3147 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3148 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3149 return gen_lowpart (mode, reg);
3150 }
3151
3152 /* Return an all-false predicate register of mode MODE. */
3153
3154 rtx
3155 aarch64_pfalse_reg (machine_mode mode)
3156 {
3157 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3158 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3159 return gen_lowpart (mode, reg);
3160 }
3161
3162 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3163 true, or alternatively if we know that the operation predicated by
3164 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3165 aarch64_sve_gp_strictness operand that describes the operation
3166 predicated by PRED1[0]. */
3167
3168 bool
3169 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3170 {
3171 machine_mode mode = GET_MODE (pred2);
3172 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3173 && mode == GET_MODE (pred1[0])
3174 && aarch64_sve_gp_strictness (pred1[1], SImode));
3175 return (pred1[0] == CONSTM1_RTX (mode)
3176 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3177 || rtx_equal_p (pred1[0], pred2));
3178 }
3179
3180 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3181 for it. PRED2[0] is the predicate for the instruction whose result
3182 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3183 for it. Return true if we can prove that the two predicates are
3184 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3185 with PRED1[0] without changing behavior. */
3186
3187 bool
3188 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3189 {
3190 machine_mode mode = GET_MODE (pred1[0]);
3191 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3192 && mode == GET_MODE (pred2[0])
3193 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3194 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3195
3196 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3197 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3198 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3199 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3200 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3201 }
3202
3203 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3204 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3205 Use TARGET as the target register if nonnull and convenient. */
3206
3207 static rtx
3208 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3209 machine_mode data_mode, rtx op1, rtx op2)
3210 {
3211 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3212 expand_operand ops[5];
3213 create_output_operand (&ops[0], target, pred_mode);
3214 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3215 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3216 create_input_operand (&ops[3], op1, data_mode);
3217 create_input_operand (&ops[4], op2, data_mode);
3218 expand_insn (icode, 5, ops);
3219 return ops[0].value;
3220 }
3221
3222 /* Use a comparison to convert integer vector SRC into MODE, which is
3223 the corresponding SVE predicate mode. Use TARGET for the result
3224 if it's nonnull and convenient. */
3225
3226 rtx
3227 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3228 {
3229 machine_mode src_mode = GET_MODE (src);
3230 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3231 src, CONST0_RTX (src_mode));
3232 }
3233
3234 /* Return the assembly token for svprfop value PRFOP. */
3235
3236 static const char *
3237 svprfop_token (enum aarch64_svprfop prfop)
3238 {
3239 switch (prfop)
3240 {
3241 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3242 AARCH64_FOR_SVPRFOP (CASE)
3243 #undef CASE
3244 case AARCH64_NUM_SVPRFOPS:
3245 break;
3246 }
3247 gcc_unreachable ();
3248 }
3249
3250 /* Return the assembly string for an SVE prefetch operation with
3251 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3252 and that SUFFIX is the format for the remaining operands. */
3253
3254 char *
3255 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3256 const char *suffix)
3257 {
3258 static char buffer[128];
3259 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3260 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3261 mnemonic, svprfop_token (prfop), suffix);
3262 gcc_assert (written < sizeof (buffer));
3263 return buffer;
3264 }
3265
3266 /* Check whether we can calculate the number of elements in PATTERN
3267 at compile time, given that there are NELTS_PER_VQ elements per
3268 128-bit block. Return the value if so, otherwise return -1. */
3269
3270 HOST_WIDE_INT
3271 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3272 {
3273 unsigned int vl, const_vg;
3274 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3275 vl = 1 + (pattern - AARCH64_SV_VL1);
3276 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3277 vl = 16 << (pattern - AARCH64_SV_VL16);
3278 else if (aarch64_sve_vg.is_constant (&const_vg))
3279 {
3280 /* There are two vector granules per quadword. */
3281 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
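/* For example, 256-bit vectors give const_vg == 4, so with
   nelts_per_vq == 4 (.S elements) there are 8 elements in total.  */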
3282 switch (pattern)
3283 {
3284 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3285 case AARCH64_SV_MUL4: return nelts & -4;
3286 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3287 case AARCH64_SV_ALL: return nelts;
3288 default: gcc_unreachable ();
3289 }
3290 }
3291 else
3292 return -1;
3293
3294 /* There are two vector granules per quadword. */
3295 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3296 if (known_le (vl, nelts_all))
3297 return vl;
3298
3299 /* Requesting more elements than are available results in a PFALSE. */
3300 if (known_gt (vl, nelts_all))
3301 return 0;
3302
3303 return -1;
3304 }
3305
3306 /* Return true if we can move VALUE into a register using a single
3307 CNT[BHWD] instruction. */
3308
3309 static bool
3310 aarch64_sve_cnt_immediate_p (poly_int64 value)
3311 {
3312 HOST_WIDE_INT factor = value.coeffs[0];
3313 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
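/* For example, a coefficient of 6 can be loaded with "cntd x0, all, mul #3",
   whereas 34 (2 * 17) is rejected because the multiplier would exceed 16.  */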
3314 return (value.coeffs[1] == factor
3315 && IN_RANGE (factor, 2, 16 * 16)
3316 && (factor & 1) == 0
3317 && factor <= 16 * (factor & -factor));
3318 }
3319
3320 /* Likewise for rtx X. */
3321
3322 bool
3323 aarch64_sve_cnt_immediate_p (rtx x)
3324 {
3325 poly_int64 value;
3326 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3327 }
3328
3329 /* Return the asm string for an instruction with a CNT-like vector size
3330 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3331 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3332 first part of the operands template (the part that comes before the
3333 vector size itself). PATTERN is the pattern to use. FACTOR is the
3334 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3335 in each quadword. If it is zero, we can use any element size. */
3336
3337 static char *
3338 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3339 aarch64_svpattern pattern,
3340 unsigned int factor,
3341 unsigned int nelts_per_vq)
3342 {
3343 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3344
3345 if (nelts_per_vq == 0)
3346 /* There is some overlap in the ranges of the four CNT instructions.
3347 Here we always use the smallest possible element size, so that the
3348 multiplier is 1 wherever possible. */
3349 nelts_per_vq = factor & -factor;
3350 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3351 gcc_assert (IN_RANGE (shift, 1, 4));
3352 char suffix = "dwhb"[shift - 1];
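/* NELTS_PER_VQ values of 2, 4, 8 and 16 select the D, W, H and B forms
   respectively.  */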
3353
3354 factor >>= shift;
3355 unsigned int written;
3356 if (pattern == AARCH64_SV_ALL && factor == 1)
3357 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3358 prefix, suffix, operands);
3359 else if (factor == 1)
3360 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3361 prefix, suffix, operands, svpattern_token (pattern));
3362 else
3363 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3364 prefix, suffix, operands, svpattern_token (pattern),
3365 factor);
3366 gcc_assert (written < sizeof (buffer));
3367 return buffer;
3368 }
3369
3370 /* Return the asm string for an instruction with a CNT-like vector size
3371 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3372 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3373 first part of the operands template (the part that comes before the
3374 vector size itself). X is the value of the vector size operand,
3375 as a polynomial integer rtx; we need to convert this into an "all"
3376 pattern with a multiplier. */
3377
3378 char *
3379 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3380 rtx x)
3381 {
3382 poly_int64 value = rtx_to_poly_int64 (x);
3383 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3384 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3385 value.coeffs[1], 0);
3386 }
3387
3388 /* Return the asm string for an instruction with a CNT-like vector size
3389 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3390 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3391 first part of the operands template (the part that comes before the
3392 vector size itself). CNT_PAT[0..2] are the operands of the
3393 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3394
3395 char *
3396 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3397 const char *operands, rtx *cnt_pat)
3398 {
3399 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3400 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3401 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3402 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3403 factor, nelts_per_vq);
3404 }
3405
3406 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3407
3408 bool
3409 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3410 {
3411 poly_int64 value;
3412 return (poly_int_rtx_p (x, &value)
3413 && (aarch64_sve_cnt_immediate_p (value)
3414 || aarch64_sve_cnt_immediate_p (-value)));
3415 }
3416
3417 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3418 operand 0. */
3419
3420 char *
3421 aarch64_output_sve_scalar_inc_dec (rtx offset)
3422 {
3423 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3424 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3425 if (offset_value.coeffs[1] > 0)
3426 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3427 offset_value.coeffs[1], 0);
3428 else
3429 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3430 -offset_value.coeffs[1], 0);
3431 }
3432
3433 /* Return true if we can add VALUE to a register using a single ADDVL
3434 or ADDPL instruction. */
3435
3436 static bool
3437 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3438 {
3439 HOST_WIDE_INT factor = value.coeffs[0];
3440 if (factor == 0 || value.coeffs[1] != factor)
3441 return false;
3442 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3443 and a value of 16 is one vector width. */
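/* For example, a factor of 6 corresponds to ADDPL #3, while a factor
   of 48 corresponds to ADDVL #3.  */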
3444 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3445 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3446 }
3447
3448 /* Likewise for rtx X. */
3449
3450 bool
3451 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3452 {
3453 poly_int64 value;
3454 return (poly_int_rtx_p (x, &value)
3455 && aarch64_sve_addvl_addpl_immediate_p (value));
3456 }
3457
3458 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3459 to operand 1 and storing the result in operand 0. */
3460
3461 char *
3462 aarch64_output_sve_addvl_addpl (rtx offset)
3463 {
3464 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3465 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3466 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3467
3468 int factor = offset_value.coeffs[1];
3469 if ((factor & 15) == 0)
3470 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3471 else
3472 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3473 return buffer;
3474 }
3475
3476 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3477 instruction. If it is, store the number of elements in each vector
3478 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3479 factor in *FACTOR_OUT (if nonnull). */
3480
3481 bool
3482 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3483 unsigned int *nelts_per_vq_out)
3484 {
3485 rtx elt;
3486 poly_int64 value;
3487
3488 if (!const_vec_duplicate_p (x, &elt)
3489 || !poly_int_rtx_p (elt, &value))
3490 return false;
3491
3492 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3493 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3494 /* There's no vector INCB. */
3495 return false;
3496
3497 HOST_WIDE_INT factor = value.coeffs[0];
3498 if (value.coeffs[1] != factor)
3499 return false;
3500
3501 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
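/* For example, with nelts_per_vq == 4 (.S elements) the valid factors
   are the multiples of 4 from 4 to 64, negated or not.  */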
3502 if ((factor % nelts_per_vq) != 0
3503 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3504 return false;
3505
3506 if (factor_out)
3507 *factor_out = factor;
3508 if (nelts_per_vq_out)
3509 *nelts_per_vq_out = nelts_per_vq;
3510 return true;
3511 }
3512
3513 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3514 instruction. */
3515
3516 bool
3517 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3518 {
3519 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3520 }
3521
3522 /* Return the asm template for an SVE vector INC or DEC instruction.
3523 OPERANDS gives the operands before the vector count and X is the
3524 value of the vector count operand itself. */
3525
3526 char *
3527 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3528 {
3529 int factor;
3530 unsigned int nelts_per_vq;
3531 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3532 gcc_unreachable ();
3533 if (factor < 0)
3534 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3535 -factor, nelts_per_vq);
3536 else
3537 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3538 factor, nelts_per_vq);
3539 }
3540
3541 static int
3542 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3543 scalar_int_mode mode)
3544 {
3545 int i;
3546 unsigned HOST_WIDE_INT val, val2, mask;
3547 int one_match, zero_match;
3548 int num_insns;
3549
3550 val = INTVAL (imm);
3551
3552 if (aarch64_move_imm (val, mode))
3553 {
3554 if (generate)
3555 emit_insn (gen_rtx_SET (dest, imm));
3556 return 1;
3557 }
3558
3559 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3560 (with XXXX non-zero). In that case check to see if the move can be done in
3561 a smaller mode. */
3562 val2 = val & 0xffffffff;
3563 if (mode == DImode
3564 && aarch64_move_imm (val2, SImode)
3565 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3566 {
3567 if (generate)
3568 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3569
3570 /* Check if we have to emit a second instruction by checking to see
3571 if any of the upper 32 bits of the original DI mode value is set. */
3572 if (val == val2)
3573 return 1;
3574
3575 i = (val >> 48) ? 48 : 32;
3576
3577 if (generate)
3578 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3579 GEN_INT ((val >> i) & 0xffff)));
3580
3581 return 2;
3582 }
3583
3584 if ((val >> 32) == 0 || mode == SImode)
3585 {
3586 if (generate)
3587 {
3588 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3589 if (mode == SImode)
3590 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3591 GEN_INT ((val >> 16) & 0xffff)));
3592 else
3593 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3594 GEN_INT ((val >> 16) & 0xffff)));
3595 }
3596 return 2;
3597 }
3598
3599 /* Remaining cases are all for DImode. */
3600
3601 mask = 0xffff;
3602 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3603 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3604 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3605 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3606
3607 if (zero_match != 2 && one_match != 2)
3608 {
3609 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3610 For a 64-bit bitmask try whether changing 16 bits to all ones or
3611 zeroes creates a valid bitmask. To check any repeated bitmask,
3612 try using 16 bits from the other 32-bit half of val. */
3613
3614 for (i = 0; i < 64; i += 16, mask <<= 16)
3615 {
3616 val2 = val & ~mask;
3617 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3618 break;
3619 val2 = val | mask;
3620 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3621 break;
3622 val2 = val2 & ~mask;
3623 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3624 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3625 break;
3626 }
3627 if (i != 64)
3628 {
3629 if (generate)
3630 {
3631 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3632 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3633 GEN_INT ((val >> i) & 0xffff)));
3634 }
3635 return 2;
3636 }
3637 }
3638
3639 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3640 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3641 otherwise skip zero bits. */
3642
3643 num_insns = 1;
3644 mask = 0xffff;
3645 val2 = one_match > zero_match ? ~val : val;
3646 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3647
3648 if (generate)
3649 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3650 ? (val | ~(mask << i))
3651 : (val & (mask << i)))));
3652 for (i += 16; i < 64; i += 16)
3653 {
3654 if ((val2 & (mask << i)) == 0)
3655 continue;
3656 if (generate)
3657 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3658 GEN_INT ((val >> i) & 0xffff)));
3659 num_insns++;
3660 }
3661
3662 return num_insns;
3663 }
3664
3665 /* Return whether imm is a 128-bit immediate which is simple enough to
3666 expand inline. */
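/* That is, the two 64-bit halves must together need no more than four
   MOV/MOVK/MOVN/bitmask instructions according to
   aarch64_internal_mov_immediate.  */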
3667 bool
3668 aarch64_mov128_immediate (rtx imm)
3669 {
3670 if (GET_CODE (imm) == CONST_INT)
3671 return true;
3672
3673 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3674
3675 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3676 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3677
3678 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3679 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3680 }
3681
3682
3683 /* Return the number of temporary registers that aarch64_add_offset_1
3684 would need to add OFFSET to a register. */
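/* For instance, an offset of 0x123000 can be added with ADD/SUB
   immediates alone and needs no temporary, while 0x1234567 exceeds the
   24-bit limit and needs one temporary to hold a MOV/MOVK immediate.  */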
3685
3686 static unsigned int
3687 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3688 {
3689 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3690 }
3691
3692 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3693 a non-polynomial OFFSET. MODE is the mode of the addition.
3694 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3695 be set and CFA adjustments added to the generated instructions.
3696
3697 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3698 temporary if register allocation is already complete. This temporary
3699 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3700 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3701 the immediate again.
3702
3703 Since this function may be used to adjust the stack pointer, we must
3704 ensure that it cannot cause transient stack deallocation (for example
3705 by first incrementing SP and then decrementing when adjusting by a
3706 large immediate). */
3707
3708 static void
3709 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3710 rtx src, HOST_WIDE_INT offset, rtx temp1,
3711 bool frame_related_p, bool emit_move_imm)
3712 {
3713 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3714 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3715
3716 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
3717 rtx_insn *insn;
3718
3719 if (!moffset)
3720 {
3721 if (!rtx_equal_p (dest, src))
3722 {
3723 insn = emit_insn (gen_rtx_SET (dest, src));
3724 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3725 }
3726 return;
3727 }
3728
3729 /* Single instruction adjustment. */
3730 if (aarch64_uimm12_shift (moffset))
3731 {
3732 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3733 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3734 return;
3735 }
3736
3737 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3738 and either:
3739
3740 a) the offset cannot be loaded by a 16-bit move or
3741 b) there is no spare register into which we can move it. */
3742 if (moffset < 0x1000000
3743 && ((!temp1 && !can_create_pseudo_p ())
3744 || !aarch64_move_imm (moffset, mode)))
3745 {
3746 HOST_WIDE_INT low_off = moffset & 0xfff;
3747
3748 low_off = offset < 0 ? -low_off : low_off;
3749 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3750 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3751 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3752 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3753 return;
3754 }
3755
3756 /* Emit a move immediate if required and an addition/subtraction. */
3757 if (emit_move_imm)
3758 {
3759 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3760 temp1 = aarch64_force_temporary (mode, temp1,
3761 gen_int_mode (moffset, mode));
3762 }
3763 insn = emit_insn (offset < 0
3764 ? gen_sub3_insn (dest, src, temp1)
3765 : gen_add3_insn (dest, src, temp1));
3766 if (frame_related_p)
3767 {
3768 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3769 rtx adj = plus_constant (mode, src, offset);
3770 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3771 }
3772 }
3773
3774 /* Return the number of temporary registers that aarch64_add_offset
3775 would need to move OFFSET into a register or add OFFSET to a register;
3776 ADD_P is true if we want the latter rather than the former. */
3777
3778 static unsigned int
3779 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3780 {
3781 /* This follows the same structure as aarch64_add_offset. */
3782 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3783 return 0;
3784
3785 unsigned int count = 0;
3786 HOST_WIDE_INT factor = offset.coeffs[1];
3787 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3788 poly_int64 poly_offset (factor, factor);
3789 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3790 /* Need one register for the ADDVL/ADDPL result. */
3791 count += 1;
3792 else if (factor != 0)
3793 {
3794 factor = abs (factor);
3795 if (factor > 16 * (factor & -factor))
3796 /* Need one register for the CNT result and one for the multiplication
3797 factor. If necessary, the second temporary can be reused for the
3798 constant part of the offset. */
3799 return 2;
3800 /* Need one register for the CNT result (which might then
3801 be shifted). */
3802 count += 1;
3803 }
3804 return count + aarch64_add_offset_1_temporaries (constant);
3805 }
3806
3807 /* If X can be represented as a poly_int64, return the number
3808 of temporaries that are required to add it to a register.
3809 Return -1 otherwise. */
3810
3811 int
3812 aarch64_add_offset_temporaries (rtx x)
3813 {
3814 poly_int64 offset;
3815 if (!poly_int_rtx_p (x, &offset))
3816 return -1;
3817 return aarch64_offset_temporaries (true, offset);
3818 }
3819
3820 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3821 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3822 be set and CFA adjustments added to the generated instructions.
3823
3824 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3825 temporary if register allocation is already complete. This temporary
3826 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3827 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3828 false to avoid emitting the immediate again.
3829
3830 TEMP2, if nonnull, is a second temporary register that doesn't
3831 overlap either DEST or SRC.
3832
3833 Since this function may be used to adjust the stack pointer, we must
3834 ensure that it cannot cause transient stack deallocation (for example
3835 by first incrementing SP and then decrementing when adjusting by a
3836 large immediate). */
3837
3838 static void
3839 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3840 poly_int64 offset, rtx temp1, rtx temp2,
3841 bool frame_related_p, bool emit_move_imm = true)
3842 {
3843 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3844 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3845 gcc_assert (temp1 == NULL_RTX
3846 || !frame_related_p
3847 || !reg_overlap_mentioned_p (temp1, dest));
3848 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3849
3850 /* Try using ADDVL or ADDPL to add the whole value. */
3851 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3852 {
3853 rtx offset_rtx = gen_int_mode (offset, mode);
3854 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3855 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3856 return;
3857 }
3858
3859 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3860 SVE vector register, over and above the minimum size of 128 bits.
3861 This is equivalent to half the value returned by CNTD with a
3862 vector shape of ALL. */
3863 HOST_WIDE_INT factor = offset.coeffs[1];
3864 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
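/* I.e. the runtime offset equals CONSTANT + FACTOR * <number of 128-bit
   blocks in a vector>, which the code below materializes as
   CONSTANT + CNTD * FACTOR / 2.  */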
3865
3866 /* Try using ADDVL or ADDPL to add the VG-based part. */
3867 poly_int64 poly_offset (factor, factor);
3868 if (src != const0_rtx
3869 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3870 {
3871 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3872 if (frame_related_p)
3873 {
3874 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3875 RTX_FRAME_RELATED_P (insn) = true;
3876 src = dest;
3877 }
3878 else
3879 {
3880 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3881 src = aarch64_force_temporary (mode, temp1, addr);
3882 temp1 = temp2;
3883 temp2 = NULL_RTX;
3884 }
3885 }
3886 /* Otherwise use a CNT-based sequence. */
3887 else if (factor != 0)
3888 {
3889 /* Use a subtraction if we have a negative factor. */
3890 rtx_code code = PLUS;
3891 if (factor < 0)
3892 {
3893 factor = -factor;
3894 code = MINUS;
3895 }
3896
3897 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3898 into the multiplication. */
3899 rtx val;
3900 int shift = 0;
3901 if (factor & 1)
3902 /* Use a right shift by 1. */
3903 shift = -1;
3904 else
3905 factor /= 2;
3906 HOST_WIDE_INT low_bit = factor & -factor;
3907 if (factor <= 16 * low_bit)
3908 {
3909 if (factor > 16 * 8)
3910 {
3911 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3912 the value with the minimum multiplier and shift it into
3913 position. */
3914 int extra_shift = exact_log2 (low_bit);
3915 shift += extra_shift;
3916 factor >>= extra_shift;
3917 }
3918 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3919 }
3920 else
3921 {
3922 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3923 directly, since that should increase the chances of being
3924 able to use a shift and add sequence. If LOW_BIT itself
3925 is out of range, just use CNTD. */
3926 if (low_bit <= 16 * 8)
3927 factor /= low_bit;
3928 else
3929 low_bit = 1;
3930
3931 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3932 val = aarch64_force_temporary (mode, temp1, val);
3933
3934 if (can_create_pseudo_p ())
3935 {
3936 rtx coeff1 = gen_int_mode (factor, mode);
3937 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3938 }
3939 else
3940 {
3941 /* Go back to using a negative multiplication factor if we have
3942 no register from which to subtract. */
3943 if (code == MINUS && src == const0_rtx)
3944 {
3945 factor = -factor;
3946 code = PLUS;
3947 }
3948 rtx coeff1 = gen_int_mode (factor, mode);
3949 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3950 val = gen_rtx_MULT (mode, val, coeff1);
3951 }
3952 }
3953
3954 if (shift > 0)
3955 {
3956 /* Multiply by 1 << SHIFT. */
3957 val = aarch64_force_temporary (mode, temp1, val);
3958 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3959 }
3960 else if (shift == -1)
3961 {
3962 /* Divide by 2. */
3963 val = aarch64_force_temporary (mode, temp1, val);
3964 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3965 }
3966
3967 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3968 if (src != const0_rtx)
3969 {
3970 val = aarch64_force_temporary (mode, temp1, val);
3971 val = gen_rtx_fmt_ee (code, mode, src, val);
3972 }
3973 else if (code == MINUS)
3974 {
3975 val = aarch64_force_temporary (mode, temp1, val);
3976 val = gen_rtx_NEG (mode, val);
3977 }
3978
3979 if (constant == 0 || frame_related_p)
3980 {
3981 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3982 if (frame_related_p)
3983 {
3984 RTX_FRAME_RELATED_P (insn) = true;
3985 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3986 gen_rtx_SET (dest, plus_constant (Pmode, src,
3987 poly_offset)));
3988 }
3989 src = dest;
3990 if (constant == 0)
3991 return;
3992 }
3993 else
3994 {
3995 src = aarch64_force_temporary (mode, temp1, val);
3996 temp1 = temp2;
3997 temp2 = NULL_RTX;
3998 }
3999
4000 emit_move_imm = true;
4001 }
4002
4003 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4004 frame_related_p, emit_move_imm);
4005 }
4006
4007 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4008 than a poly_int64. */
4009
4010 void
4011 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4012 rtx offset_rtx, rtx temp1, rtx temp2)
4013 {
4014 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4015 temp1, temp2, false);
4016 }
4017
4018 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4019 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4020 if TEMP1 already contains abs (DELTA). */
4021
4022 static inline void
4023 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4024 {
4025 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4026 temp1, temp2, true, emit_move_imm);
4027 }
4028
4029 /* Subtract DELTA from the stack pointer, marking the instructions
4030 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4031 if nonnull. */
4032
4033 static inline void
4034 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4035 bool emit_move_imm = true)
4036 {
4037 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4038 temp1, temp2, frame_related_p, emit_move_imm);
4039 }
4040
4041 /* Set DEST to (vec_series BASE STEP). */
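/* For example, (vec_series 0 1) in VNx4SImode can be emitted as a single
   INDEX instruction (of the form "index\tz0.s, #0, #1"), since both
   operands are within the immediate range [-16, 15]; out-of-range bases
   or steps are first forced into scalar registers and the register forms
   of INDEX are used instead.  */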
4042
4043 static void
4044 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4045 {
4046 machine_mode mode = GET_MODE (dest);
4047 scalar_mode inner = GET_MODE_INNER (mode);
4048
4049 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4050 if (!aarch64_sve_index_immediate_p (base))
4051 base = force_reg (inner, base);
4052 if (!aarch64_sve_index_immediate_p (step))
4053 step = force_reg (inner, step);
4054
4055 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4056 }
4057
4058 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4059 register of mode MODE. Use TARGET for the result if it's nonnull
4060 and convenient.
4061
4062 The two vector modes must have the same element mode. The behavior
4063 is to duplicate architectural lane N of SRC into architectural lanes
4064 N + I * STEP of the result. On big-endian targets, architectural
4065 lane 0 of an Advanced SIMD vector is the last element of the vector
4066 in memory layout, so for big-endian targets this operation has the
4067 effect of reversing SRC before duplicating it. Callers need to
4068 account for this. */
4069
4070 rtx
4071 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4072 {
4073 machine_mode src_mode = GET_MODE (src);
4074 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4075 insn_code icode = (BYTES_BIG_ENDIAN
4076 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4077 : code_for_aarch64_vec_duplicate_vq_le (mode));
4078
4079 unsigned int i = 0;
4080 expand_operand ops[3];
4081 create_output_operand (&ops[i++], target, mode);
4082 create_output_operand (&ops[i++], src, src_mode);
4083 if (BYTES_BIG_ENDIAN)
4084 {
4085 /* Create a PARALLEL describing the reversal of SRC. */
4086 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4087 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4088 nelts_per_vq - 1, -1);
4089 create_fixed_operand (&ops[i++], sel);
4090 }
4091 expand_insn (icode, i, ops);
4092 return ops[0].value;
4093 }
4094
4095 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4096 the memory image into DEST. Return true on success. */
4097
4098 static bool
4099 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4100 {
4101 src = force_const_mem (GET_MODE (src), src);
4102 if (!src)
4103 return false;
4104
4105 /* Make sure that the address is legitimate. */
4106 if (!aarch64_sve_ld1rq_operand_p (src))
4107 {
4108 rtx addr = force_reg (Pmode, XEXP (src, 0));
4109 src = replace_equiv_address (src, addr);
4110 }
4111
4112 machine_mode mode = GET_MODE (dest);
4113 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4114 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4115 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4116 return true;
4117 }
4118
4119 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4120 SVE data mode and isn't a legitimate constant. Use TARGET for the
4121 result if convenient.
4122
4123 The returned register can have whatever mode seems most natural
4124 given the contents of SRC. */
4125
4126 static rtx
4127 aarch64_expand_sve_const_vector (rtx target, rtx src)
4128 {
4129 machine_mode mode = GET_MODE (src);
4130 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4131 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4132 scalar_mode elt_mode = GET_MODE_INNER (mode);
4133 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4134 unsigned int container_bits = aarch64_sve_container_bits (mode);
4135 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4136
4137 if (nelts_per_pattern == 1
4138 && encoded_bits <= 128
4139 && container_bits != elt_bits)
4140 {
4141 /* We have a partial vector mode and a constant whose full-vector
4142 equivalent would occupy a repeating 128-bit sequence. Build that
4143 full-vector equivalent instead, so that we have the option of
4144 using LD1RQ and Advanced SIMD operations. */
4145 unsigned int repeat = container_bits / elt_bits;
4146 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4147 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4148 for (unsigned int i = 0; i < npatterns; ++i)
4149 for (unsigned int j = 0; j < repeat; ++j)
4150 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4151 target = aarch64_target_reg (target, full_mode);
4152 return aarch64_expand_sve_const_vector (target, builder.build ());
4153 }
4154
4155 if (nelts_per_pattern == 1 && encoded_bits == 128)
4156 {
4157 /* The constant is a duplicated quadword but can't be narrowed
4158 beyond a quadword. Get the memory image of the first quadword
4159 as a 128-bit vector and try using LD1RQ to load it from memory.
4160
4161 The effect for both endiannesses is to load memory lane N into
4162 architectural lanes N + I * STEP of the result. On big-endian
4163 targets, the layout of the 128-bit vector in an Advanced SIMD
4164 register would be different from its layout in an SVE register,
4165 but this 128-bit vector is a memory value only. */
4166 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4167 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4168 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4169 return target;
4170 }
4171
4172 if (nelts_per_pattern == 1 && encoded_bits < 128)
4173 {
4174 /* The vector is a repeating sequence of 64 bits or fewer.
4175 See if we can load them using an Advanced SIMD move and then
4176 duplicate it to fill a vector. This is better than using a GPR
4177 move because it keeps everything in the same register file. */
4178 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4179 rtx_vector_builder builder (vq_mode, npatterns, 1);
4180 for (unsigned int i = 0; i < npatterns; ++i)
4181 {
4182 /* We want memory lane N to go into architectural lane N,
4183 so reverse for big-endian targets. The DUP .Q pattern
4184 has a compensating reverse built-in. */
4185 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4186 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4187 }
4188 rtx vq_src = builder.build ();
4189 if (aarch64_simd_valid_immediate (vq_src, NULL))
4190 {
4191 vq_src = force_reg (vq_mode, vq_src);
4192 return aarch64_expand_sve_dupq (target, mode, vq_src);
4193 }
4194
4195 /* Get an integer representation of the repeating part of Advanced
4196 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4197 which for big-endian targets is lane-swapped wrt a normal
4198 Advanced SIMD vector. This means that for both endiannesses,
4199 memory lane N of SVE vector SRC corresponds to architectural
4200 lane N of a register holding VQ_SRC. This in turn means that
4201 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4202 as a single 128-bit value) and thus that memory lane 0 of SRC is
4203 in the lsb of the integer. Duplicating the integer therefore
4204 ensures that memory lane N of SRC goes into architectural lane
4205 N + I * INDEX of the SVE register. */
4206 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4207 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4208 if (elt_value)
4209 {
4210 /* Pretend that we had a vector of INT_MODE to start with. */
4211 elt_mode = int_mode;
4212 mode = aarch64_full_sve_mode (int_mode).require ();
4213
4214 /* If the integer can be moved into a general register by a
4215 single instruction, do that and duplicate the result. */
4216 if (CONST_INT_P (elt_value)
4217 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4218 {
4219 elt_value = force_reg (elt_mode, elt_value);
4220 return expand_vector_broadcast (mode, elt_value);
4221 }
4222 }
4223 else if (npatterns == 1)
4224 /* We're duplicating a single value, but can't do better than
4225 force it to memory and load from there. This handles things
4226 like symbolic constants. */
4227 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4228
4229 if (elt_value)
4230 {
4231 /* Load the element from memory if we can, otherwise move it into
4232 a register and use a DUP. */
4233 rtx op = force_const_mem (elt_mode, elt_value);
4234 if (!op)
4235 op = force_reg (elt_mode, elt_value);
4236 return expand_vector_broadcast (mode, op);
4237 }
4238 }
4239
4240 /* Try using INDEX. */
4241 rtx base, step;
4242 if (const_vec_series_p (src, &base, &step))
4243 {
4244 aarch64_expand_vec_series (target, base, step);
4245 return target;
4246 }
4247
4248 /* From here on, it's better to force the whole constant to memory
4249 if we can. */
4250 if (GET_MODE_NUNITS (mode).is_constant ())
4251 return NULL_RTX;
4252
4253 /* Expand each pattern individually. */
4254 gcc_assert (npatterns > 1);
4255 rtx_vector_builder builder;
4256 auto_vec<rtx, 16> vectors (npatterns);
4257 for (unsigned int i = 0; i < npatterns; ++i)
4258 {
4259 builder.new_vector (mode, 1, nelts_per_pattern);
4260 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4261 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4262 vectors.quick_push (force_reg (mode, builder.build ()));
4263 }
4264
4265 /* Use permutes to interleave the separate vectors. */
4266 while (npatterns > 1)
4267 {
4268 npatterns /= 2;
4269 for (unsigned int i = 0; i < npatterns; ++i)
4270 {
4271 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4272 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4273 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4274 vectors[i] = tmp;
4275 }
4276 }
4277 gcc_assert (vectors[0] == target);
4278 return target;
4279 }
4280
4281 /* Use WHILE to set a predicate register of mode MODE in which the first
4282 VL bits are set and the rest are clear. Use TARGET for the register
4283 if it's nonnull and convenient. */
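/* For instance, a request for VL == 3 in VNx4BImode emits a WHILELO of
   the form "whilelo\tp0.s, xzr, x1" with x1 holding 3, leaving the first
   three .S predicate lanes set and the rest clear.  */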
4284
4285 static rtx
4286 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4287 unsigned int vl)
4288 {
4289 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4290 target = aarch64_target_reg (target, mode);
4291 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4292 target, const0_rtx, limit));
4293 return target;
4294 }
4295
4296 static rtx
4297 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4298
4299 /* BUILDER is a constant predicate in which the index of every set bit
4300 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4301 by inverting every element at a multiple of ELT_SIZE and EORing the
4302 result with an ELT_SIZE PTRUE.
4303
4304 Return a register that contains the constant on success, otherwise
4305 return null. Use TARGET as the register if it is nonnull and
4306 convenient. */
4307
4308 static rtx
4309 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4310 unsigned int elt_size)
4311 {
4312 /* Invert every element at a multiple of ELT_SIZE, keeping the
4313 other bits zero. */
4314 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4315 builder.nelts_per_pattern ());
4316 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4317 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4318 inv_builder.quick_push (const1_rtx);
4319 else
4320 inv_builder.quick_push (const0_rtx);
4321 inv_builder.finalize ();
4322
4323 /* See if we can load the constant cheaply. */
4324 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4325 if (!inv)
4326 return NULL_RTX;
4327
4328 /* EOR the result with an ELT_SIZE PTRUE. */
4329 rtx mask = aarch64_ptrue_all (elt_size);
4330 mask = force_reg (VNx16BImode, mask);
4331 target = aarch64_target_reg (target, VNx16BImode);
4332 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4333 return target;
4334 }
4335
4336 /* BUILDER is a constant predicate in which the index of every set bit
4337 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4338 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4339 register on success, otherwise return null. Use TARGET as the register
4340 if nonnull and convenient. */
4341
4342 static rtx
4343 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4344 unsigned int elt_size,
4345 unsigned int permute_size)
4346 {
4347 /* We're going to split the constant into two new constants A and B,
4348 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4349 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4350
4351 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4352 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4353
4354 where _ indicates elements that will be discarded by the permute.
4355
4356 First calculate the ELT_SIZEs for A and B. */
4357 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4358 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4359 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4360 if (INTVAL (builder.elt (i)) != 0)
4361 {
4362 if (i & permute_size)
4363 b_elt_size |= i - permute_size;
4364 else
4365 a_elt_size |= i;
4366 }
4367 a_elt_size &= -a_elt_size;
4368 b_elt_size &= -b_elt_size;
4369
4370 /* Now construct the vectors themselves. */
4371 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4372 builder.nelts_per_pattern ());
4373 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4374 builder.nelts_per_pattern ());
4375 unsigned int nelts = builder.encoded_nelts ();
4376 for (unsigned int i = 0; i < nelts; ++i)
4377 if (i & (elt_size - 1))
4378 {
4379 a_builder.quick_push (const0_rtx);
4380 b_builder.quick_push (const0_rtx);
4381 }
4382 else if ((i & permute_size) == 0)
4383 {
4384 /* The A and B elements are significant. */
4385 a_builder.quick_push (builder.elt (i));
4386 b_builder.quick_push (builder.elt (i + permute_size));
4387 }
4388 else
4389 {
4390 /* The A and B elements are going to be discarded, so pick whatever
4391 is likely to give a nice constant. We are targeting element
4392 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4393 with the aim of each being a sequence of ones followed by
4394 a sequence of zeros. So:
4395
4396 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4397 duplicate the last X_ELT_SIZE element, to extend the
4398 current sequence of ones or zeros.
4399
4400 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4401 zero, so that the constant really does have X_ELT_SIZE and
4402 not a smaller size. */
4403 if (a_elt_size > permute_size)
4404 a_builder.quick_push (const0_rtx);
4405 else
4406 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4407 if (b_elt_size > permute_size)
4408 b_builder.quick_push (const0_rtx);
4409 else
4410 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4411 }
4412 a_builder.finalize ();
4413 b_builder.finalize ();
4414
4415 /* Try loading A into a register. */
4416 rtx_insn *last = get_last_insn ();
4417 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4418 if (!a)
4419 return NULL_RTX;
4420
4421 /* Try loading B into a register. */
4422 rtx b = a;
4423 if (a_builder != b_builder)
4424 {
4425 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4426 if (!b)
4427 {
4428 delete_insns_since (last);
4429 return NULL_RTX;
4430 }
4431 }
4432
4433 /* Emit the TRN1 itself. */
4434 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4435 target = aarch64_target_reg (target, mode);
4436 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4437 gen_lowpart (mode, a),
4438 gen_lowpart (mode, b)));
4439 return target;
4440 }
4441
4442 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4443 constant in BUILDER into an SVE predicate register. Return the register
4444 on success, otherwise return null. Use TARGET for the register if
4445 nonnull and convenient.
4446
4447 ALLOW_RECURSE_P is true if we can use methods that would call this
4448 function recursively. */
4449
4450 static rtx
4451 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4452 bool allow_recurse_p)
4453 {
4454 if (builder.encoded_nelts () == 1)
4455 /* A PFALSE or a PTRUE .B ALL. */
4456 return aarch64_emit_set_immediate (target, builder);
4457
4458 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4459 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4460 {
4461 /* If we can load the constant using PTRUE, use it as-is. */
4462 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4463 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4464 return aarch64_emit_set_immediate (target, builder);
4465
4466 /* Otherwise use WHILE to set the first VL bits. */
4467 return aarch64_sve_move_pred_via_while (target, mode, vl);
4468 }
4469
4470 if (!allow_recurse_p)
4471 return NULL_RTX;
4472
4473 /* Try inverting the vector in element size ELT_SIZE and then EORing
4474 the result with an ELT_SIZE PTRUE. */
4475 if (INTVAL (builder.elt (0)) == 0)
4476 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4477 elt_size))
4478 return res;
4479
4480 /* Try using TRN1 to permute two simpler constants. */
4481 for (unsigned int i = elt_size; i <= 8; i *= 2)
4482 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4483 elt_size, i))
4484 return res;
4485
4486 return NULL_RTX;
4487 }
4488
4489 /* Return an SVE predicate register that contains the VNx16BImode
4490 constant in BUILDER, without going through the move expanders.
4491
4492 The returned register can have whatever mode seems most natural
4493 given the contents of BUILDER. Use TARGET for the result if
4494 convenient. */
4495
4496 static rtx
4497 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4498 {
4499 /* Try loading the constant using pure predicate operations. */
4500 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4501 return res;
4502
4503 /* Try forcing the constant to memory. */
4504 if (builder.full_nelts ().is_constant ())
4505 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4506 {
4507 target = aarch64_target_reg (target, VNx16BImode);
4508 emit_move_insn (target, mem);
4509 return target;
4510 }
4511
4512 /* The last resort is to load the constant as an integer and then
4513 compare it against zero. Use -1 for set bits in order to increase
4514 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4515 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4516 builder.nelts_per_pattern ());
4517 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4518 int_builder.quick_push (INTVAL (builder.elt (i))
4519 ? constm1_rtx : const0_rtx);
4520 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4521 int_builder.build ());
4522 }
4523
4524 /* Set DEST to immediate IMM. */
4525
4526 void
4527 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4528 {
4529 machine_mode mode = GET_MODE (dest);
4530
4531 /* Check on what type of symbol it is. */
4532 scalar_int_mode int_mode;
4533 if ((GET_CODE (imm) == SYMBOL_REF
4534 || GET_CODE (imm) == LABEL_REF
4535 || GET_CODE (imm) == CONST
4536 || GET_CODE (imm) == CONST_POLY_INT)
4537 && is_a <scalar_int_mode> (mode, &int_mode))
4538 {
4539 rtx mem;
4540 poly_int64 offset;
4541 HOST_WIDE_INT const_offset;
4542 enum aarch64_symbol_type sty;
4543
4544 /* If we have (const (plus symbol offset)), separate out the offset
4545 before we start classifying the symbol. */
4546 rtx base = strip_offset (imm, &offset);
4547
4548 /* We must always add an offset involving VL separately, rather than
4549 folding it into the relocation. */
4550 if (!offset.is_constant (&const_offset))
4551 {
4552 if (!TARGET_SVE)
4553 {
4554 aarch64_report_sve_required ();
4555 return;
4556 }
4557 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4558 emit_insn (gen_rtx_SET (dest, imm));
4559 else
4560 {
4561 /* Do arithmetic on 32-bit values if the result is smaller
4562 than that. */
4563 if (partial_subreg_p (int_mode, SImode))
4564 {
4565 /* It is invalid to do symbol calculations in modes
4566 narrower than SImode. */
4567 gcc_assert (base == const0_rtx);
4568 dest = gen_lowpart (SImode, dest);
4569 int_mode = SImode;
4570 }
4571 if (base != const0_rtx)
4572 {
4573 base = aarch64_force_temporary (int_mode, dest, base);
4574 aarch64_add_offset (int_mode, dest, base, offset,
4575 NULL_RTX, NULL_RTX, false);
4576 }
4577 else
4578 aarch64_add_offset (int_mode, dest, base, offset,
4579 dest, NULL_RTX, false);
4580 }
4581 return;
4582 }
4583
4584 sty = aarch64_classify_symbol (base, const_offset);
4585 switch (sty)
4586 {
4587 case SYMBOL_FORCE_TO_MEM:
4588 if (const_offset != 0
4589 && targetm.cannot_force_const_mem (int_mode, imm))
4590 {
4591 gcc_assert (can_create_pseudo_p ());
4592 base = aarch64_force_temporary (int_mode, dest, base);
4593 aarch64_add_offset (int_mode, dest, base, const_offset,
4594 NULL_RTX, NULL_RTX, false);
4595 return;
4596 }
4597
4598 mem = force_const_mem (ptr_mode, imm);
4599 gcc_assert (mem);
4600
4601 /* If we aren't generating PC relative literals, then
4602 we need to expand the literal pool access carefully.
4603 This is something that needs to be done in a number
4604 of places, so could well live as a separate function. */
4605 if (!aarch64_pcrelative_literal_loads)
4606 {
4607 gcc_assert (can_create_pseudo_p ());
4608 base = gen_reg_rtx (ptr_mode);
4609 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4610 if (ptr_mode != Pmode)
4611 base = convert_memory_address (Pmode, base);
4612 mem = gen_rtx_MEM (ptr_mode, base);
4613 }
4614
4615 if (int_mode != ptr_mode)
4616 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4617
4618 emit_insn (gen_rtx_SET (dest, mem));
4619
4620 return;
4621
4622 case SYMBOL_SMALL_TLSGD:
4623 case SYMBOL_SMALL_TLSDESC:
4624 case SYMBOL_SMALL_TLSIE:
4625 case SYMBOL_SMALL_GOT_28K:
4626 case SYMBOL_SMALL_GOT_4G:
4627 case SYMBOL_TINY_GOT:
4628 case SYMBOL_TINY_TLSIE:
4629 if (const_offset != 0)
4630 {
4631 gcc_assert (can_create_pseudo_p ());
4632 base = aarch64_force_temporary (int_mode, dest, base);
4633 aarch64_add_offset (int_mode, dest, base, const_offset,
4634 NULL_RTX, NULL_RTX, false);
4635 return;
4636 }
4637 /* FALLTHRU */
4638
4639 case SYMBOL_SMALL_ABSOLUTE:
4640 case SYMBOL_TINY_ABSOLUTE:
4641 case SYMBOL_TLSLE12:
4642 case SYMBOL_TLSLE24:
4643 case SYMBOL_TLSLE32:
4644 case SYMBOL_TLSLE48:
4645 aarch64_load_symref_appropriately (dest, imm, sty);
4646 return;
4647
4648 default:
4649 gcc_unreachable ();
4650 }
4651 }
4652
4653 if (!CONST_INT_P (imm))
4654 {
4655 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4656 {
4657 /* Only the low bit of each .H, .S and .D element is defined,
4658 so we can set the upper bits to whatever we like. If the
4659 predicate is all-true in MODE, prefer to set all the undefined
4660 bits as well, so that we can share a single .B predicate for
4661 all modes. */
4662 if (imm == CONSTM1_RTX (mode))
4663 imm = CONSTM1_RTX (VNx16BImode);
4664
4665 /* All methods for constructing predicate modes wider than VNx16BI
4666 will set the upper bits of each element to zero. Expose this
4667 by moving such constants as a VNx16BI, so that all bits are
4668 significant and so that constants for different modes can be
4669 shared. The wider constant will still be available as a
4670 REG_EQUAL note. */
4671 rtx_vector_builder builder;
4672 if (aarch64_get_sve_pred_bits (builder, imm))
4673 {
4674 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4675 if (dest != res)
4676 emit_move_insn (dest, gen_lowpart (mode, res));
4677 return;
4678 }
4679 }
4680
4681 if (GET_CODE (imm) == HIGH
4682 || aarch64_simd_valid_immediate (imm, NULL))
4683 {
4684 emit_insn (gen_rtx_SET (dest, imm));
4685 return;
4686 }
4687
4688 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4689 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4690 {
4691 if (dest != res)
4692 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4693 return;
4694 }
4695
4696 rtx mem = force_const_mem (mode, imm);
4697 gcc_assert (mem);
4698 emit_move_insn (dest, mem);
4699 return;
4700 }
4701
4702 aarch64_internal_mov_immediate (dest, imm, true,
4703 as_a <scalar_int_mode> (mode));
4704 }
4705
4706 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4707 that is known to contain PTRUE. */
4708
4709 void
4710 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4711 {
4712 expand_operand ops[3];
4713 machine_mode mode = GET_MODE (dest);
4714 create_output_operand (&ops[0], dest, mode);
4715 create_input_operand (&ops[1], pred, GET_MODE (pred));
4716 create_input_operand (&ops[2], src, mode);
4717 temporary_volatile_ok v (true);
4718 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4719 }
4720
4721 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4722 operand is in memory. In this case we need to use the predicated LD1
4723 and ST1 instead of LDR and STR, both for correctness on big-endian
4724 targets and because LD1 and ST1 support a wider range of addressing modes.
4725 PRED_MODE is the mode of the predicate.
4726
4727 See the comment at the head of aarch64-sve.md for details about the
4728 big-endian handling. */
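/* For example, a memory-to-memory copy of a VNx4SImode value becomes an
   LD1W into a fresh vector register followed by an ST1W from it, both
   predicated on an all-true PTRUE.  */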
4729
4730 void
4731 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4732 {
4733 machine_mode mode = GET_MODE (dest);
4734 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4735 if (!register_operand (src, mode)
4736 && !register_operand (dest, mode))
4737 {
4738 rtx tmp = gen_reg_rtx (mode);
4739 if (MEM_P (src))
4740 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4741 else
4742 emit_move_insn (tmp, src);
4743 src = tmp;
4744 }
4745 aarch64_emit_sve_pred_move (dest, ptrue, src);
4746 }
4747
4748 /* Called only on big-endian targets. See whether an SVE vector move
4749 from SRC to DEST is effectively a REV[BHW] instruction, because at
4750 least one operand is a subreg of an SVE vector that has wider or
4751 narrower elements. Return true and emit the instruction if so.
4752
4753 For example:
4754
4755 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4756
4757 represents a VIEW_CONVERT between the following vectors, viewed
4758 in memory order:
4759
4760 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4761 R1: { [0], [1], [2], [3], ... }
4762
4763 The high part of lane X in R2 should therefore correspond to lane X*2
4764 of R1, but the register representations are:
4765
4766 msb lsb
4767 R2: ...... [1].high [1].low [0].high [0].low
4768 R1: ...... [3] [2] [1] [0]
4769
4770 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4771 We therefore need a reverse operation to swap the high and low values
4772 around.
4773
4774 This is purely an optimization. Without it we would spill the
4775 subreg operand to the stack in one mode and reload it in the
4776 other mode, which has the same effect as the REV. */
4777
4778 bool
4779 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4780 {
4781 gcc_assert (BYTES_BIG_ENDIAN);
4782 if (GET_CODE (dest) == SUBREG)
4783 dest = SUBREG_REG (dest);
4784 if (GET_CODE (src) == SUBREG)
4785 src = SUBREG_REG (src);
4786
4787 /* The optimization handles two single SVE REGs with different element
4788 sizes. */
4789 if (!REG_P (dest)
4790 || !REG_P (src)
4791 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4792 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4793 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4794 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4795 return false;
4796
4797 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4798 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4799 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4800 UNSPEC_REV_SUBREG);
4801 emit_insn (gen_rtx_SET (dest, unspec));
4802 return true;
4803 }
4804
4805 /* Return a copy of X with mode MODE, without changing its other
4806 attributes. Unlike gen_lowpart, this doesn't care whether the
4807 mode change is valid. */
4808
4809 rtx
4810 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4811 {
4812 if (GET_MODE (x) == mode)
4813 return x;
4814
4815 x = shallow_copy_rtx (x);
4816 set_mode_and_regno (x, mode, REGNO (x));
4817 return x;
4818 }
4819
4820 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4821 stored in wider integer containers. */
4822
4823 static unsigned int
4824 aarch64_sve_rev_unspec (machine_mode mode)
4825 {
4826 switch (GET_MODE_UNIT_SIZE (mode))
4827 {
4828 case 1: return UNSPEC_REVB;
4829 case 2: return UNSPEC_REVH;
4830 case 4: return UNSPEC_REVW;
4831 }
4832 gcc_unreachable ();
4833 }
4834
4835 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4836 operands. */
4837
4838 void
4839 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4840 {
4841 /* Decide which REV operation we need. The mode with wider elements
4842 determines the mode of the operands and the mode with the narrower
4843 elements determines the reverse width. */
4844 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
4845 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
4846 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4847 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4848 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4849
4850 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4851 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
4852
4853 /* Get the operands in the appropriate modes and emit the instruction. */
4854 ptrue = gen_lowpart (pred_mode, ptrue);
4855 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4856 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4857 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4858 dest, ptrue, src));
4859 }
4860
4861 static bool
4862 aarch64_function_ok_for_sibcall (tree, tree exp)
4863 {
4864 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4865 return false;
4866
4867 return true;
4868 }
4869
4870 /* Implement TARGET_PASS_BY_REFERENCE. */
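/* For example, a homogeneous aggregate of four doubles is a candidate
   for the FP/SIMD registers and so is passed by value, whereas a 24-byte
   non-homogeneous structure is larger than two GPRs and is passed by
   reference.  */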
4871
4872 static bool
4873 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4874 const function_arg_info &arg)
4875 {
4876 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4877 HOST_WIDE_INT size;
4878 machine_mode dummymode;
4879 int nregs;
4880
4881 unsigned int num_zr, num_pr;
4882 if (arg.type && aarch64_sve::builtin_type_p (arg.type, &num_zr, &num_pr))
4883 {
4884 if (pcum && !pcum->silent_p && !TARGET_SVE)
4885 /* We can't gracefully recover at this point, so make this a
4886 fatal error. */
4887 fatal_error (input_location, "arguments of type %qT require"
4888 " the SVE ISA extension", arg.type);
4889
4890 /* Variadic SVE types are passed by reference. Normal non-variadic
4891 arguments are too if we've run out of registers. */
4892 return (!arg.named
4893 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4894 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4895 }
4896
4897 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4898 if (arg.mode == BLKmode && arg.type)
4899 size = int_size_in_bytes (arg.type);
4900 else
4901 /* No frontends can create types with variable-sized modes, so we
4902 shouldn't be asked to pass or return them. */
4903 size = GET_MODE_SIZE (arg.mode).to_constant ();
4904
4905 /* Aggregates are passed by reference based on their size. */
4906 if (arg.aggregate_type_p ())
4907 size = int_size_in_bytes (arg.type);
4908
4909 /* Variable sized arguments are always returned by reference. */
4910 if (size < 0)
4911 return true;
4912
4913 /* Can this be a candidate to be passed in fp/simd register(s)? */
4914 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4915 &dummymode, &nregs,
4916 NULL))
4917 return false;
4918
4919 /* Arguments which are variable sized or larger than 2 registers are
4920 passed by reference unless they are a homogeneous floating-point
4921 aggregate. */
4922 return size > 2 * UNITS_PER_WORD;
4923 }
4924
4925 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4926 static bool
4927 aarch64_return_in_msb (const_tree valtype)
4928 {
4929 machine_mode dummy_mode;
4930 int dummy_int;
4931
4932 /* Never happens in little-endian mode. */
4933 if (!BYTES_BIG_ENDIAN)
4934 return false;
4935
4936 /* Only composite types smaller than or equal to 16 bytes can
4937 be potentially returned in registers. */
4938 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4939 || int_size_in_bytes (valtype) <= 0
4940 || int_size_in_bytes (valtype) > 16)
4941 return false;
4942
4943 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4944 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4945 is always passed/returned in the least significant bits of fp/simd
4946 register(s). */
4947 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4948 &dummy_mode, &dummy_int, NULL))
4949 return false;
4950
4951 return true;
4952 }
4953
4954 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4955 after promotion, and after partial SVE types have been replaced by
4956 their integer equivalents. */
4957 static rtx
4958 aarch64_function_value_1 (const_tree type, machine_mode mode)
4959 {
4960 unsigned int num_zr, num_pr;
4961 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
4962 {
4963 /* Don't raise an error here if we're called when SVE is disabled,
4964 since this is really just a query function. Other code must
4965 do that where appropriate. */
4966 mode = TYPE_MODE_RAW (type);
4967 gcc_assert (VECTOR_MODE_P (mode)
4968 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4969
4970 if (num_zr > 0 && num_pr == 0)
4971 return gen_rtx_REG (mode, V0_REGNUM);
4972
4973 if (num_zr == 0 && num_pr == 1)
4974 return gen_rtx_REG (mode, P0_REGNUM);
4975
4976 gcc_unreachable ();
4977 }
4978
4979 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4980 returned in memory, not by value. */
4981 gcc_assert (!aarch64_sve_mode_p (mode));
4982
4983 if (aarch64_return_in_msb (type))
4984 {
4985 HOST_WIDE_INT size = int_size_in_bytes (type);
4986
4987 if (size % UNITS_PER_WORD != 0)
4988 {
4989 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4990 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4991 }
4992 }
4993
4994 int count;
4995 machine_mode ag_mode;
4996 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4997 &ag_mode, &count, NULL))
4998 {
4999 if (!aarch64_composite_type_p (type, mode))
5000 {
5001 gcc_assert (count == 1 && mode == ag_mode);
5002 return gen_rtx_REG (mode, V0_REGNUM);
5003 }
5004 else
5005 {
5006 int i;
5007 rtx par;
5008
5009 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5010 for (i = 0; i < count; i++)
5011 {
5012 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5013 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5014 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5015 XVECEXP (par, 0, i) = tmp;
5016 }
5017 return par;
5018 }
5019 }
5020 else
5021 return gen_rtx_REG (mode, R0_REGNUM);
5022 }
5023
5024 /* Implement TARGET_FUNCTION_VALUE.
5025 Define how to find the value returned by a function. */
5026
5027 static rtx
5028 aarch64_function_value (const_tree type, const_tree func,
5029 bool outgoing ATTRIBUTE_UNUSED)
5030 {
5031 machine_mode mode;
5032 int unsignedp;
5033
5034 mode = TYPE_MODE (type);
5035 if (INTEGRAL_TYPE_P (type))
5036 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5037
5038 /* Vector types can acquire a partial SVE mode using things like
5039 __attribute__((vector_size(N))), and this is potentially useful.
5040 However, the choice of mode doesn't affect the type's ABI identity,
5041 so we should treat the types as though they had the associated
5042 integer mode, just like they did before SVE was introduced.
5043
5044 We know that the vector must be 128 bits or smaller, otherwise we'd
5045 have returned it in memory instead. */
5046 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5047 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5048 {
5049 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5050 rtx reg = aarch64_function_value_1 (type, int_mode);
5051 /* Vector types are never returned in the MSB and are never split. */
5052 gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
5053 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5054 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
5055 }
5056
5057 return aarch64_function_value_1 (type, mode);
5058 }
5059
5060 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5061 Return true if REGNO is the number of a hard register in which the values
5062 of a called function may come back. */
5063
5064 static bool
5065 aarch64_function_value_regno_p (const unsigned int regno)
5066 {
5067 /* A maximum of 16 bytes can be returned in the general registers. Examples
5068 of 16-byte return values are: 128-bit integers and 16-byte small
5069 structures (excluding homogeneous floating-point aggregates). */
5070 if (regno == R0_REGNUM || regno == R1_REGNUM)
5071 return true;
5072
5073 /* Up to four fp/simd registers can return a function value, e.g. a
5074 homogeneous floating-point aggregate having four members. */
5075 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5076 return TARGET_FLOAT;
5077
5078 return false;
5079 }
5080
5081 /* Implement TARGET_RETURN_IN_MEMORY.
5082
5083 If the type T of the result of a function is such that
5084 void func (T arg)
5085 would require that arg be passed as a value in a register (or set of
5086 registers) according to the parameter passing rules, then the result
5087 is returned in the same registers as would be used for such an
5088 argument. */
5089
5090 static bool
5091 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5092 {
5093 HOST_WIDE_INT size;
5094 machine_mode ag_mode;
5095 int count;
5096
5097 if (!AGGREGATE_TYPE_P (type)
5098 && TREE_CODE (type) != COMPLEX_TYPE
5099 && TREE_CODE (type) != VECTOR_TYPE)
5100 /* Simple scalar types are always returned in registers. */
5101 return false;
5102
5103 unsigned int num_zr, num_pr;
5104 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5105 {
5106 /* All SVE types we support fit in registers. For example, it isn't
5107 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5108 predicates. */
5109 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
5110 return false;
5111 }
5112
5113 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5114 type,
5115 &ag_mode,
5116 &count,
5117 NULL))
5118 return false;
5119
5120 /* Types larger than 2 registers are returned in memory. */
5121 size = int_size_in_bytes (type);
5122 return (size < 0 || size > 2 * UNITS_PER_WORD);
5123 }
5124
5125 static bool
5126 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5127 const_tree type, int *nregs)
5128 {
5129 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5130 return aarch64_vfp_is_call_or_return_candidate (mode,
5131 type,
5132 &pcum->aapcs_vfp_rmode,
5133 nregs,
5134 NULL);
5135 }
5136
5137 /* Given MODE and TYPE of a function argument, return the alignment in
5138 bits. The idea is to suppress any stronger alignment requested by
5139 the user and opt for the natural alignment (specified in AAPCS64 \S
5140 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5141 calculated in versions of GCC prior to GCC-9. This is a helper
5142 function for local use only. */
5143
5144 static unsigned int
5145 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5146 bool *abi_break)
5147 {
5148 *abi_break = false;
5149 if (!type)
5150 return GET_MODE_ALIGNMENT (mode);
5151
5152 if (integer_zerop (TYPE_SIZE (type)))
5153 return 0;
5154
5155 gcc_assert (TYPE_MODE (type) == mode);
5156
5157 if (!AGGREGATE_TYPE_P (type))
5158 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5159
5160 if (TREE_CODE (type) == ARRAY_TYPE)
5161 return TYPE_ALIGN (TREE_TYPE (type));
5162
5163 unsigned int alignment = 0;
5164 unsigned int bitfield_alignment = 0;
5165 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5166 if (TREE_CODE (field) == FIELD_DECL)
5167 {
5168 alignment = std::max (alignment, DECL_ALIGN (field));
5169 if (DECL_BIT_FIELD_TYPE (field))
5170 bitfield_alignment
5171 = std::max (bitfield_alignment,
5172 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5173 }
5174
5175 if (bitfield_alignment > alignment)
5176 {
5177 *abi_break = true;
5178 return bitfield_alignment;
5179 }
5180
5181 return alignment;
5182 }
5183
5184 /* Layout a function argument according to the AAPCS64 rules. The rule
5185 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5186 mode that was originally given to us by the target hook, whereas the
5187 mode in ARG might be the result of replacing partial SVE modes with
5188 the equivalent integer mode. */
5189
5190 static void
5191 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
5192 machine_mode orig_mode)
5193 {
5194 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5195 tree type = arg.type;
5196 machine_mode mode = arg.mode;
5197 int ncrn, nvrn, nregs;
5198 bool allocate_ncrn, allocate_nvrn;
5199 HOST_WIDE_INT size;
5200 bool abi_break;
5201
5202 /* We need to do this once per argument. */
5203 if (pcum->aapcs_arg_processed)
5204 return;
5205
5206 /* Vector types can acquire a partial SVE mode using things like
5207 __attribute__((vector_size(N))), and this is potentially useful.
5208 However, the choice of mode doesn't affect the type's ABI identity,
5209 so we should treat the types as though they had the associated
5210 integer mode, just like they did before SVE was introduced.
5211
5212 We know that the vector must be 128 bits or smaller, otherwise we'd
5213 have passed it by reference instead. */
5214 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5215 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5216 {
5217 function_arg_info tmp_arg = arg;
5218 tmp_arg.mode = int_mode_for_mode (mode).require ();
5219 aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
5220 if (rtx reg = pcum->aapcs_reg)
5221 {
5222 gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
5223 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5224 pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5225 }
5226 return;
5227 }
5228
5229 pcum->aapcs_arg_processed = true;
5230
5231 unsigned int num_zr, num_pr;
5232 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5233 {
5234 /* The PCS says that it is invalid to pass an SVE value to an
5235 unprototyped function. There is no ABI-defined location we
5236 can return in this case, so we have no real choice but to raise
5237 an error immediately, even though this is only a query function. */
5238 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5239 {
5240 gcc_assert (!pcum->silent_p);
5241 error ("SVE type %qT cannot be passed to an unprototyped function",
5242 arg.type);
5243 /* Avoid repeating the message, and avoid tripping the assert
5244 below. */
5245 pcum->pcs_variant = ARM_PCS_SVE;
5246 }
5247
5248 /* We would have converted the argument into pass-by-reference
5249 form if it didn't fit in registers. */
5250 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5251 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5252 gcc_assert (arg.named
5253 && pcum->pcs_variant == ARM_PCS_SVE
5254 && aarch64_sve_mode_p (mode)
5255 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5256 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5257
5258 if (num_zr > 0 && num_pr == 0)
5259 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5260 else if (num_zr == 0 && num_pr == 1)
5261 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5262 else
5263 gcc_unreachable ();
5264 return;
5265 }
5266
5267 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5268 passed by reference, not by value. */
5269 gcc_assert (!aarch64_sve_mode_p (mode));
5270
5271 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5272 if (type)
5273 size = int_size_in_bytes (type);
5274 else
5275 /* No frontends can create types with variable-sized modes, so we
5276 shouldn't be asked to pass or return them. */
5277 size = GET_MODE_SIZE (mode).to_constant ();
5278 size = ROUND_UP (size, UNITS_PER_WORD);
5279
5280 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5281 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5282 mode,
5283 type,
5284 &nregs);
5285
5286 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5287 The following code thus handles passing by SIMD/FP registers first. */
5288
5289 nvrn = pcum->aapcs_nvrn;
5290
5291 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
5292 and homogeneous short-vector aggregates (HVA). */
5293 if (allocate_nvrn)
5294 {
5295 if (!pcum->silent_p && !TARGET_FLOAT)
5296 aarch64_err_no_fpadvsimd (mode);
5297
5298 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5299 {
5300 pcum->aapcs_nextnvrn = nvrn + nregs;
5301 if (!aarch64_composite_type_p (type, mode))
5302 {
5303 gcc_assert (nregs == 1);
5304 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5305 }
5306 else
5307 {
5308 rtx par;
5309 int i;
5310 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5311 for (i = 0; i < nregs; i++)
5312 {
5313 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5314 V0_REGNUM + nvrn + i);
5315 rtx offset = gen_int_mode
5316 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5317 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5318 XVECEXP (par, 0, i) = tmp;
5319 }
5320 pcum->aapcs_reg = par;
5321 }
5322 return;
5323 }
5324 else
5325 {
5326 /* C.3 NSRN is set to 8. */
5327 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5328 goto on_stack;
5329 }
5330 }
5331
5332 ncrn = pcum->aapcs_ncrn;
5333 nregs = size / UNITS_PER_WORD;
5334
5335 /* C6 - C9, though the sign and zero extension semantics are
5336 handled elsewhere. This is the case where the argument fits
5337 entirely in general registers. */
5338 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5339 {
5340 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5341
5342 /* C.8 if the argument has an alignment of 16 then the NGRN is
5343 rounded up to the next even number. */
5344 if (nregs == 2
5345 && ncrn % 2
5346 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5347 comparison is there because for > 16 * BITS_PER_UNIT
5348 alignment nregs should be > 2 and therefore it should be
5349 passed by reference rather than value. */
5350 && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5351 == 16 * BITS_PER_UNIT))
5352 {
5353 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5354 inform (input_location, "parameter passing for argument of type "
5355 "%qT changed in GCC 9.1", type);
5356 ++ncrn;
5357 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5358 }
5359
5360 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5361 A reg is still generated for it, but the caller should be smart
5362 enough not to use it. */
5363 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5364 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5365 else
5366 {
5367 rtx par;
5368 int i;
5369
5370 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5371 for (i = 0; i < nregs; i++)
5372 {
5373 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5374 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5375 GEN_INT (i * UNITS_PER_WORD));
5376 XVECEXP (par, 0, i) = tmp;
5377 }
5378 pcum->aapcs_reg = par;
5379 }
5380
5381 pcum->aapcs_nextncrn = ncrn + nregs;
5382 return;
5383 }
5384
5385 /* C.11 */
5386 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5387
5388 /* The argument is passed on the stack; record the needed number of words for
5389 this argument and align the total size if necessary. */
5390 on_stack:
5391 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5392
5393 if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5394 == 16 * BITS_PER_UNIT)
5395 {
5396 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5397 if (pcum->aapcs_stack_size != new_size)
5398 {
5399 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5400 inform (input_location, "parameter passing for argument of type "
5401 "%qT changed in GCC 9.1", type);
5402 pcum->aapcs_stack_size = new_size;
5403 }
5404 }
5405 return;
5406 }
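/* A worked example of the layout above (a sketch, not ABI text): for

     void f (int a, double b, __int128 c);

   "a" is allocated w0, "b" goes to v0 via the allocate_nvrn path, and
   "c" needs two GP registers with 16-byte alignment, so the C.8 rounding
   above bumps the NGRN from 1 to 2 and "c" is passed in x2/x3.  */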
5407
5408 /* Implement TARGET_FUNCTION_ARG. */
5409
5410 static rtx
5411 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5412 {
5413 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5414 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5415 || pcum->pcs_variant == ARM_PCS_SIMD
5416 || pcum->pcs_variant == ARM_PCS_SVE);
5417
5418 if (arg.end_marker_p ())
5419 return gen_int_mode (pcum->pcs_variant, DImode);
5420
5421 aarch64_layout_arg (pcum_v, arg, arg.mode);
5422 return pcum->aapcs_reg;
5423 }
5424
5425 void
5426 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5427 const_tree fntype,
5428 rtx libname ATTRIBUTE_UNUSED,
5429 const_tree fndecl ATTRIBUTE_UNUSED,
5430 unsigned n_named ATTRIBUTE_UNUSED,
5431 bool silent_p)
5432 {
5433 pcum->aapcs_ncrn = 0;
5434 pcum->aapcs_nvrn = 0;
5435 pcum->aapcs_nprn = 0;
5436 pcum->aapcs_nextncrn = 0;
5437 pcum->aapcs_nextnvrn = 0;
5438 pcum->aapcs_nextnprn = 0;
5439 if (fntype)
5440 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5441 else
5442 pcum->pcs_variant = ARM_PCS_AAPCS64;
5443 pcum->aapcs_reg = NULL_RTX;
5444 pcum->aapcs_arg_processed = false;
5445 pcum->aapcs_stack_words = 0;
5446 pcum->aapcs_stack_size = 0;
5447 pcum->silent_p = silent_p;
5448
5449 if (!silent_p
5450 && !TARGET_FLOAT
5451 && fndecl && TREE_PUBLIC (fndecl)
5452 && fntype && fntype != error_mark_node)
5453 {
5454 const_tree type = TREE_TYPE (fntype);
5455 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5456 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5457 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5458 &mode, &nregs, NULL))
5459 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5460 }
5461
5462 if (!silent_p
5463 && !TARGET_SVE
5464 && pcum->pcs_variant == ARM_PCS_SVE)
5465 {
5466 /* We can't gracefully recover at this point, so make this a
5467 fatal error. */
5468 if (fndecl)
5469 fatal_error (input_location, "%qE requires the SVE ISA extension",
5470 fndecl);
5471 else
5472 fatal_error (input_location, "calls to functions of type %qT require"
5473 " the SVE ISA extension", fntype);
5474 }
5475 }
5476
5477 static void
5478 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5479 const function_arg_info &arg)
5480 {
5481 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5482 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5483 || pcum->pcs_variant == ARM_PCS_SIMD
5484 || pcum->pcs_variant == ARM_PCS_SVE)
5485 {
5486 aarch64_layout_arg (pcum_v, arg, arg.mode);
5487 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5488 != (pcum->aapcs_stack_words != 0));
5489 pcum->aapcs_arg_processed = false;
5490 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5491 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5492 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5493 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5494 pcum->aapcs_stack_words = 0;
5495 pcum->aapcs_reg = NULL_RTX;
5496 }
5497 }
5498
5499 bool
5500 aarch64_function_arg_regno_p (unsigned regno)
5501 {
5502 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5503 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5504 }
5505
5506 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5507 PARM_BOUNDARY bits of alignment, but will be given anything up
5508 to STACK_BOUNDARY bits if the type requires it. This makes sure
5509 that both before and after the layout of each argument, the Next
5510 Stacked Argument Address (NSAA) will have a minimum alignment of
5511 8 bytes. */
5512
5513 static unsigned int
5514 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5515 {
5516 bool abi_break;
5517 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5518 &abi_break);
5519 if (abi_break && warn_psabi)
5520 inform (input_location, "parameter passing for argument of type "
5521 "%qT changed in GCC 9.1", type);
5522
5523 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5524 }
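/* For example (assuming the usual PARM_BOUNDARY of 64 and STACK_BOUNDARY
   of 128 for this port): a char argument still gets a 64-bit slot
   boundary, an __int128 gets the full 128 bits, and an over-aligned
   scalar is clamped to 128 bits.  */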
5525
5526 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5527
5528 static fixed_size_mode
5529 aarch64_get_reg_raw_mode (int regno)
5530 {
5531 if (TARGET_SVE && FP_REGNUM_P (regno))
5532 /* Don't use the SVE part of the register for __builtin_apply and
5533 __builtin_return. The SVE registers aren't used by the normal PCS,
5534 so using them there would be a waste of time. The PCS extensions
5535 for SVE types are fundamentally incompatible with the
5536 __builtin_return/__builtin_apply interface. */
5537 return as_a <fixed_size_mode> (V16QImode);
5538 return default_get_reg_raw_mode (regno);
5539 }
5540
5541 /* Implement TARGET_FUNCTION_ARG_PADDING.
5542
5543 Small aggregate types are placed at the lowest memory address.
5544
5545 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5546
5547 static pad_direction
5548 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5549 {
5550 /* On little-endian targets, the least significant byte of every stack
5551 argument is passed at the lowest byte address of the stack slot. */
5552 if (!BYTES_BIG_ENDIAN)
5553 return PAD_UPWARD;
5554
5555 /* Otherwise, integral, floating-point and pointer types are padded downward:
5556 the least significant byte of a stack argument is passed at the highest
5557 byte address of the stack slot. */
5558 if (type
5559 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5560 || POINTER_TYPE_P (type))
5561 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5562 return PAD_DOWNWARD;
5563
5564 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
5565 return PAD_UPWARD;
5566 }
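/* As a sketch of the big-endian case: a 3-byte structure passed on the
   stack is padded upward (data at the low addresses of its slot), whereas
   a short is padded downward so that its least significant byte sits at
   the highest address of its 8-byte slot.  */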
5567
5568 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5569
5570 It specifies padding for the last (possibly the only)
5571 element of a block move between registers and memory.
5572 Assuming the block is in memory, padding upward means that
5573 the last element is padded after its most significant byte,
5574 while with downward padding the last element is padded at
5575 its least significant byte side.
5576
5577 Small aggregates and small complex types are always padded
5578 upwards.
5579
5580 We don't need to worry about homogeneous floating-point or
5581 short-vector aggregates; their move is not affected by the
5582 padding direction determined here. Regardless of endianness,
5583 each element of such an aggregate is put in the least
5584 significant bits of a fp/simd register.
5585
5586 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5587 register has useful data, and return the opposite if the most
5588 significant byte does. */
5589
5590 bool
5591 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5592 bool first ATTRIBUTE_UNUSED)
5593 {
5594
5595 /* Small composite types are always padded upward. */
5596 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5597 {
5598 HOST_WIDE_INT size;
5599 if (type)
5600 size = int_size_in_bytes (type);
5601 else
5602 /* No frontends can create types with variable-sized modes, so we
5603 shouldn't be asked to pass or return them. */
5604 size = GET_MODE_SIZE (mode).to_constant ();
5605 if (size < 2 * UNITS_PER_WORD)
5606 return true;
5607 }
5608
5609 /* Otherwise, use the default padding. */
5610 return !BYTES_BIG_ENDIAN;
5611 }
5612
5613 static scalar_int_mode
5614 aarch64_libgcc_cmp_return_mode (void)
5615 {
5616 return SImode;
5617 }
5618
5619 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5620
5621 /* We use the 12-bit shifted immediate arithmetic instructions so values
5622 must be a multiple of (1 << 12), i.e. 4096. */
5623 #define ARITH_FACTOR 4096
5624
5625 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5626 #error Cannot use simple address calculation for stack probing
5627 #endif
5628
5629 /* The pair of scratch registers used for stack probing. */
5630 #define PROBE_STACK_FIRST_REG R9_REGNUM
5631 #define PROBE_STACK_SECOND_REG R10_REGNUM
5632
5633 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5634 inclusive. These are offsets from the current stack pointer. */
5635
5636 static void
5637 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5638 {
5639 HOST_WIDE_INT size;
5640 if (!poly_size.is_constant (&size))
5641 {
5642 sorry ("stack probes for SVE frames");
5643 return;
5644 }
5645
5646 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5647
5648 /* See the same assertion on PROBE_INTERVAL above. */
5649 gcc_assert ((first % ARITH_FACTOR) == 0);
5650
5651 /* See if we have a constant small number of probes to generate. If so,
5652 that's the easy case. */
5653 if (size <= PROBE_INTERVAL)
5654 {
5655 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5656
5657 emit_set_insn (reg1,
5658 plus_constant (Pmode,
5659 stack_pointer_rtx, -(first + base)));
5660 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5661 }
5662
5663 /* The run-time loop is made up of 8 insns in the generic case while the
5664 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5665 else if (size <= 4 * PROBE_INTERVAL)
5666 {
5667 HOST_WIDE_INT i, rem;
5668
5669 emit_set_insn (reg1,
5670 plus_constant (Pmode,
5671 stack_pointer_rtx,
5672 -(first + PROBE_INTERVAL)));
5673 emit_stack_probe (reg1);
5674
5675 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5676 it exceeds SIZE. If only two probes are needed, this will not
5677 generate any code. Then probe at FIRST + SIZE. */
5678 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5679 {
5680 emit_set_insn (reg1,
5681 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5682 emit_stack_probe (reg1);
5683 }
5684
5685 rem = size - (i - PROBE_INTERVAL);
5686 if (rem > 256)
5687 {
5688 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5689
5690 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5691 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5692 }
5693 else
5694 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5695 }
5696
5697 /* Otherwise, do the same as above, but in a loop. Note that we must be
5698 extra careful with variables wrapping around because we might be at
5699 the very top (or the very bottom) of the address space and we have
5700 to be able to handle this case properly; in particular, we use an
5701 equality test for the loop condition. */
5702 else
5703 {
5704 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5705
5706 /* Step 1: round SIZE to the previous multiple of the interval. */
5707
5708 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5709
5710
5711 /* Step 2: compute initial and final value of the loop counter. */
5712
5713 /* TEST_ADDR = SP + FIRST. */
5714 emit_set_insn (reg1,
5715 plus_constant (Pmode, stack_pointer_rtx, -first));
5716
5717 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5718 HOST_WIDE_INT adjustment = - (first + rounded_size);
5719 if (! aarch64_uimm12_shift (adjustment))
5720 {
5721 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5722 true, Pmode);
5723 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5724 }
5725 else
5726 emit_set_insn (reg2,
5727 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5728
5729 /* Step 3: the loop
5730
5731 do
5732 {
5733 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5734 probe at TEST_ADDR
5735 }
5736 while (TEST_ADDR != LAST_ADDR)
5737
5738 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5739 until it is equal to ROUNDED_SIZE. */
5740
5741 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5742
5743
5744 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5745 that SIZE is equal to ROUNDED_SIZE. */
5746
5747 if (size != rounded_size)
5748 {
5749 HOST_WIDE_INT rem = size - rounded_size;
5750
5751 if (rem > 256)
5752 {
5753 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5754
5755 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5756 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5757 }
5758 else
5759 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5760 }
5761 }
5762
5763 /* Make sure nothing is scheduled before we are done. */
5764 emit_insn (gen_blockage ());
5765 }
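/* As a rough sketch (assuming PROBE_INTERVAL == 4096 and FIRST == 0),
   the small-size case above for a 4096-byte range emits something like:

        sub     x9, sp, #4096
        str     xzr, [x9]

   i.e. a single probe at the lowest address of the range, with larger
   constant sizes adding one sub/str pair per extra interval.  */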
5766
5767 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5768 absolute addresses. */
5769
5770 const char *
5771 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5772 {
5773 static int labelno = 0;
5774 char loop_lab[32];
5775 rtx xops[2];
5776
5777 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5778
5779 /* Loop. */
5780 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5781
5782 HOST_WIDE_INT stack_clash_probe_interval
5783 = 1 << param_stack_clash_protection_guard_size;
5784
5785 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5786 xops[0] = reg1;
5787 HOST_WIDE_INT interval;
5788 if (flag_stack_clash_protection)
5789 interval = stack_clash_probe_interval;
5790 else
5791 interval = PROBE_INTERVAL;
5792
5793 gcc_assert (aarch64_uimm12_shift (interval));
5794 xops[1] = GEN_INT (interval);
5795
5796 output_asm_insn ("sub\t%0, %0, %1", xops);
5797
5798 /* If doing stack clash protection then we probe up by the ABI specified
5799 amount. We do this because we're dropping full pages at a time in the
5800 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5801 if (flag_stack_clash_protection)
5802 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5803 else
5804 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5805
5806 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5807 by this amount for each iteration. */
5808 output_asm_insn ("str\txzr, [%0, %1]", xops);
5809
5810 /* Test if TEST_ADDR == LAST_ADDR. */
5811 xops[1] = reg2;
5812 output_asm_insn ("cmp\t%0, %1", xops);
5813
5814 /* Branch. */
5815 fputs ("\tb.ne\t", asm_out_file);
5816 assemble_name_raw (asm_out_file, loop_lab);
5817 fputc ('\n', asm_out_file);
5818
5819 return "";
5820 }
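/* The loop printed above therefore looks roughly like this (with
   PROBE_INTERVAL == 4096 and without stack-clash protection):

   .LPSRL0:
        sub     x9, x9, #4096
        str     xzr, [x9]
        cmp     x9, x10
        b.ne    .LPSRL0

   With -fstack-clash-protection the interval and the probe offset come
   from the guard-size parameters instead.  */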
5821
5822 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5823 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5824 of GUARD_SIZE. When a probe is emitted it is done at most
5825 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5826 at most MIN_PROBE_THRESHOLD. By the end of this function
5827 BASE = BASE - ADJUSTMENT. */
5828
5829 const char *
5830 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5831 rtx min_probe_threshold, rtx guard_size)
5832 {
5833 /* This function is not allowed to use any instruction generation function
5834 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5835 so instead emit the code you want using output_asm_insn. */
5836 gcc_assert (flag_stack_clash_protection);
5837 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5838 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5839
5840 /* The minimum required allocation before the residual requires probing. */
5841 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5842
5843 /* Clamp the value down to the nearest value that can be used with a cmp. */
5844 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5845 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5846
5847 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5848 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5849
5850 static int labelno = 0;
5851 char loop_start_lab[32];
5852 char loop_end_lab[32];
5853 rtx xops[2];
5854
5855 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5856 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5857
5858 /* Emit loop start label. */
5859 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5860
5861 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5862 xops[0] = adjustment;
5863 xops[1] = probe_offset_value_rtx;
5864 output_asm_insn ("cmp\t%0, %1", xops);
5865
5866 /* Branch to end if not enough adjustment to probe. */
5867 fputs ("\tb.lt\t", asm_out_file);
5868 assemble_name_raw (asm_out_file, loop_end_lab);
5869 fputc ('\n', asm_out_file);
5870
5871 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5872 xops[0] = base;
5873 xops[1] = probe_offset_value_rtx;
5874 output_asm_insn ("sub\t%0, %0, %1", xops);
5875
5876 /* Probe at BASE. */
5877 xops[1] = const0_rtx;
5878 output_asm_insn ("str\txzr, [%0, %1]", xops);
5879
5880 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5881 xops[0] = adjustment;
5882 xops[1] = probe_offset_value_rtx;
5883 output_asm_insn ("sub\t%0, %0, %1", xops);
5884
5885 /* Branch to start if still more bytes to allocate. */
5886 fputs ("\tb\t", asm_out_file);
5887 assemble_name_raw (asm_out_file, loop_start_lab);
5888 fputc ('\n', asm_out_file);
5889
5890 /* No probe needed for the remaining adjustment; loop exit. */
5891 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5892
5893 /* BASE = BASE - ADJUSTMENT. */
5894 xops[0] = base;
5895 xops[1] = adjustment;
5896 output_asm_insn ("sub\t%0, %0, %1", xops);
5897 return "";
5898 }
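/* Putting the pieces above together, the emitted sequence is roughly:

   .SVLPSPL0:
        cmp     adjustment, guard
        b.lt    .SVLPEND0
        sub     base, base, guard
        str     xzr, [base]
        sub     adjustment, adjustment, guard
        b       .SVLPSPL0
   .SVLPEND0:
        sub     base, base, adjustment

   where "guard" stands for the clamped residual_probe_guard value.  */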
5899
5900 /* Determine whether a frame chain needs to be generated. */
5901 static bool
5902 aarch64_needs_frame_chain (void)
5903 {
5904 /* Force a frame chain for EH returns so the return address is at FP+8. */
5905 if (frame_pointer_needed || crtl->calls_eh_return)
5906 return true;
5907
5908 /* A leaf function cannot have calls or write LR. */
5909 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5910
5911 /* Don't use a frame chain in leaf functions if leaf frame pointers
5912 are disabled. */
5913 if (flag_omit_leaf_frame_pointer && is_leaf)
5914 return false;
5915
5916 return aarch64_use_frame_pointer;
5917 }
5918
5919 /* Mark the registers that need to be saved by the callee and calculate
5920 the size of the callee-saved registers area and frame record (both FP
5921 and LR may be omitted). */
5922 static void
5923 aarch64_layout_frame (void)
5924 {
5925 poly_int64 offset = 0;
5926 int regno, last_fp_reg = INVALID_REGNUM;
5927 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5928 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5929 bool frame_related_fp_reg_p = false;
5930 aarch64_frame &frame = cfun->machine->frame;
5931
5932 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5933
5934 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5935 the mid-end is doing. */
5936 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5937
5938 #define SLOT_NOT_REQUIRED (-2)
5939 #define SLOT_REQUIRED (-1)
5940
5941 frame.wb_candidate1 = INVALID_REGNUM;
5942 frame.wb_candidate2 = INVALID_REGNUM;
5943 frame.spare_pred_reg = INVALID_REGNUM;
5944
5945 /* First mark all the registers that really need to be saved... */
5946 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5947 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5948
5949 /* ... that includes the eh data registers (if needed)... */
5950 if (crtl->calls_eh_return)
5951 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5952 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5953
5954 /* ... and any callee saved register that dataflow says is live. */
5955 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5956 if (df_regs_ever_live_p (regno)
5957 && !fixed_regs[regno]
5958 && (regno == R30_REGNUM
5959 || !crtl->abi->clobbers_full_reg_p (regno)))
5960 frame.reg_offset[regno] = SLOT_REQUIRED;
5961
5962 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5963 if (df_regs_ever_live_p (regno)
5964 && !fixed_regs[regno]
5965 && !crtl->abi->clobbers_full_reg_p (regno))
5966 {
5967 frame.reg_offset[regno] = SLOT_REQUIRED;
5968 last_fp_reg = regno;
5969 if (aarch64_emit_cfi_for_reg_p (regno))
5970 frame_related_fp_reg_p = true;
5971 }
5972
5973 /* Big-endian SVE frames need a spare predicate register in order
5974 to save Z8-Z15. Decide which register they should use. Prefer
5975 an unused argument register if possible, so that we don't force P4
5976 to be saved unnecessarily. */
5977 if (frame_related_fp_reg_p
5978 && crtl->abi->id () == ARM_PCS_SVE
5979 && BYTES_BIG_ENDIAN)
5980 {
5981 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5982 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5983 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5984 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5985 break;
5986 gcc_assert (regno <= P7_REGNUM);
5987 frame.spare_pred_reg = regno;
5988 df_set_regs_ever_live (regno, true);
5989 }
5990
5991 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5992 if (df_regs_ever_live_p (regno)
5993 && !fixed_regs[regno]
5994 && !crtl->abi->clobbers_full_reg_p (regno))
5995 frame.reg_offset[regno] = SLOT_REQUIRED;
5996
5997 /* With stack-clash, LR must be saved in non-leaf functions. */
5998 gcc_assert (crtl->is_leaf
5999 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6000
6001 /* Now assign stack slots for the registers. Start with the predicate
6002 registers, since predicate LDR and STR have a relatively small
6003 offset range. These saves happen below the hard frame pointer. */
6004 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6005 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6006 {
6007 frame.reg_offset[regno] = offset;
6008 offset += BYTES_PER_SVE_PRED;
6009 }
6010
6011 if (maybe_ne (offset, 0))
6012 {
6013 /* If we have any vector registers to save above the predicate registers,
6014 the offset of the vector register save slots need to be a multiple
6015 of the vector size. This lets us use the immediate forms of LDR/STR
6016 (or LD1/ST1 for big-endian).
6017
6018 A vector register is 8 times the size of a predicate register,
6019 and we need to save a maximum of 12 predicate registers, so the
6020 first vector register will be at either #1, MUL VL or #2, MUL VL.
6021
6022 If we don't have any vector registers to save, and we know how
6023 big the predicate save area is, we can just round it up to the
6024 next 16-byte boundary. */
6025 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6026 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6027 else
6028 {
6029 if (known_le (offset, vector_save_size))
6030 offset = vector_save_size;
6031 else if (known_le (offset, vector_save_size * 2))
6032 offset = vector_save_size * 2;
6033 else
6034 gcc_unreachable ();
6035 }
6036 }
6037
6038 /* If we need to save any SVE vector registers, add them next. */
6039 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6040 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6041 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6042 {
6043 frame.reg_offset[regno] = offset;
6044 offset += vector_save_size;
6045 }
6046
6047 /* OFFSET is now the offset of the hard frame pointer from the bottom
6048 of the callee save area. */
6049 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6050 frame.below_hard_fp_saved_regs_size = offset;
6051 if (frame.emit_frame_chain)
6052 {
6053 /* FP and LR are placed in the linkage record. */
6054 frame.reg_offset[R29_REGNUM] = offset;
6055 frame.wb_candidate1 = R29_REGNUM;
6056 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6057 frame.wb_candidate2 = R30_REGNUM;
6058 offset += 2 * UNITS_PER_WORD;
6059 }
6060
6061 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6062 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6063 {
6064 frame.reg_offset[regno] = offset;
6065 if (frame.wb_candidate1 == INVALID_REGNUM)
6066 frame.wb_candidate1 = regno;
6067 else if (frame.wb_candidate2 == INVALID_REGNUM)
6068 frame.wb_candidate2 = regno;
6069 offset += UNITS_PER_WORD;
6070 }
6071
6072 poly_int64 max_int_offset = offset;
6073 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6074 bool has_align_gap = maybe_ne (offset, max_int_offset);
6075
6076 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6077 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6078 {
6079 /* If there is an alignment gap between integer and fp callee-saves,
6080 allocate the last fp register to it if possible. */
6081 if (regno == last_fp_reg
6082 && has_align_gap
6083 && known_eq (vector_save_size, 8)
6084 && multiple_p (offset, 16))
6085 {
6086 frame.reg_offset[regno] = max_int_offset;
6087 break;
6088 }
6089
6090 frame.reg_offset[regno] = offset;
6091 if (frame.wb_candidate1 == INVALID_REGNUM)
6092 frame.wb_candidate1 = regno;
6093 else if (frame.wb_candidate2 == INVALID_REGNUM
6094 && frame.wb_candidate1 >= V0_REGNUM)
6095 frame.wb_candidate2 = regno;
6096 offset += vector_save_size;
6097 }
6098
6099 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6100
6101 frame.saved_regs_size = offset;
6102
6103 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6104
6105 poly_int64 above_outgoing_args
6106 = aligned_upper_bound (varargs_and_saved_regs_size
6107 + get_frame_size (),
6108 STACK_BOUNDARY / BITS_PER_UNIT);
6109
6110 frame.hard_fp_offset
6111 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6112
6113 /* Both these values are already aligned. */
6114 gcc_assert (multiple_p (crtl->outgoing_args_size,
6115 STACK_BOUNDARY / BITS_PER_UNIT));
6116 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6117
6118 frame.locals_offset = frame.saved_varargs_size;
6119
6120 frame.initial_adjust = 0;
6121 frame.final_adjust = 0;
6122 frame.callee_adjust = 0;
6123 frame.sve_callee_adjust = 0;
6124 frame.callee_offset = 0;
6125
6126 HOST_WIDE_INT max_push_offset = 0;
6127 if (frame.wb_candidate2 != INVALID_REGNUM)
6128 max_push_offset = 512;
6129 else if (frame.wb_candidate1 != INVALID_REGNUM)
6130 max_push_offset = 256;
6131
6132 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6133 HOST_WIDE_INT const_saved_regs_size;
6134 if (frame.frame_size.is_constant (&const_size)
6135 && const_size < max_push_offset
6136 && known_eq (frame.hard_fp_offset, const_size))
6137 {
6138 /* Simple, small frame with no outgoing arguments:
6139
6140 stp reg1, reg2, [sp, -frame_size]!
6141 stp reg3, reg4, [sp, 16] */
6142 frame.callee_adjust = const_size;
6143 }
6144 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6145 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6146 && const_outgoing_args_size + const_saved_regs_size < 512
6147 /* We could handle this case even with outgoing args, provided
6148 that the number of args left us with valid offsets for all
6149 predicate and vector save slots. It's such a rare case that
6150 it hardly seems worth the effort though. */
6151 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6152 && !(cfun->calls_alloca
6153 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6154 && const_fp_offset < max_push_offset))
6155 {
6156 /* Frame with small outgoing arguments:
6157
6158 sub sp, sp, frame_size
6159 stp reg1, reg2, [sp, outgoing_args_size]
6160 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6161 frame.initial_adjust = frame.frame_size;
6162 frame.callee_offset = const_outgoing_args_size;
6163 }
6164 else if (saves_below_hard_fp_p
6165 && known_eq (frame.saved_regs_size,
6166 frame.below_hard_fp_saved_regs_size))
6167 {
6168 /* Frame in which all saves are SVE saves:
6169
6170 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6171 save SVE registers relative to SP
6172 sub sp, sp, outgoing_args_size */
6173 frame.initial_adjust = (frame.hard_fp_offset
6174 + frame.below_hard_fp_saved_regs_size);
6175 frame.final_adjust = crtl->outgoing_args_size;
6176 }
6177 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6178 && const_fp_offset < max_push_offset)
6179 {
6180 /* Frame with large outgoing arguments or SVE saves, but with
6181 a small local area:
6182
6183 stp reg1, reg2, [sp, -hard_fp_offset]!
6184 stp reg3, reg4, [sp, 16]
6185 [sub sp, sp, below_hard_fp_saved_regs_size]
6186 [save SVE registers relative to SP]
6187 sub sp, sp, outgoing_args_size */
6188 frame.callee_adjust = const_fp_offset;
6189 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6190 frame.final_adjust = crtl->outgoing_args_size;
6191 }
6192 else
6193 {
6194 /* Frame with large local area and outgoing arguments or SVE saves,
6195 using frame pointer:
6196
6197 sub sp, sp, hard_fp_offset
6198 stp x29, x30, [sp, 0]
6199 add x29, sp, 0
6200 stp reg3, reg4, [sp, 16]
6201 [sub sp, sp, below_hard_fp_saved_regs_size]
6202 [save SVE registers relative to SP]
6203 sub sp, sp, outgoing_args_size */
6204 frame.initial_adjust = frame.hard_fp_offset;
6205 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6206 frame.final_adjust = crtl->outgoing_args_size;
6207 }
6208
6209 /* Make sure the individual adjustments add up to the full frame size. */
6210 gcc_assert (known_eq (frame.initial_adjust
6211 + frame.callee_adjust
6212 + frame.sve_callee_adjust
6213 + frame.final_adjust, frame.frame_size));
6214
6215 frame.laid_out = true;
6216 }
6217
6218 /* Return true if the register REGNO is saved on entry to
6219 the current function. */
6220
6221 static bool
6222 aarch64_register_saved_on_entry (int regno)
6223 {
6224 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6225 }
6226
6227 /* Return the next register, from REGNO up to LIMIT, that the callee
6228 needs to save. */
6229
6230 static unsigned
6231 aarch64_next_callee_save (unsigned regno, unsigned limit)
6232 {
6233 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6234 regno ++;
6235 return regno;
6236 }
6237
6238 /* Push the register number REGNO of mode MODE to the stack with write-back
6239 adjusting the stack by ADJUSTMENT. */
6240
6241 static void
6242 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6243 HOST_WIDE_INT adjustment)
6244 {
6245 rtx base_rtx = stack_pointer_rtx;
6246 rtx insn, reg, mem;
6247
6248 reg = gen_rtx_REG (mode, regno);
6249 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6250 plus_constant (Pmode, base_rtx, -adjustment));
6251 mem = gen_frame_mem (mode, mem);
6252
6253 insn = emit_move_insn (mem, reg);
6254 RTX_FRAME_RELATED_P (insn) = 1;
6255 }
6256
6257 /* Generate and return an instruction to store the pair of registers
6258 REG and REG2 of mode MODE to location BASE with write-back adjusting
6259 the stack location BASE by ADJUSTMENT. */
6260
6261 static rtx
6262 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6263 HOST_WIDE_INT adjustment)
6264 {
6265 switch (mode)
6266 {
6267 case E_DImode:
6268 return gen_storewb_pairdi_di (base, base, reg, reg2,
6269 GEN_INT (-adjustment),
6270 GEN_INT (UNITS_PER_WORD - adjustment));
6271 case E_DFmode:
6272 return gen_storewb_pairdf_di (base, base, reg, reg2,
6273 GEN_INT (-adjustment),
6274 GEN_INT (UNITS_PER_WORD - adjustment));
6275 case E_TFmode:
6276 return gen_storewb_pairtf_di (base, base, reg, reg2,
6277 GEN_INT (-adjustment),
6278 GEN_INT (UNITS_PER_VREG - adjustment));
6279 default:
6280 gcc_unreachable ();
6281 }
6282 }
6283
6284 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6285 stack pointer by ADJUSTMENT. */
6286
6287 static void
6288 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6289 {
6290 rtx_insn *insn;
6291 machine_mode mode = aarch64_reg_save_mode (regno1);
6292
6293 if (regno2 == INVALID_REGNUM)
6294 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6295
6296 rtx reg1 = gen_rtx_REG (mode, regno1);
6297 rtx reg2 = gen_rtx_REG (mode, regno2);
6298
6299 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6300 reg2, adjustment));
6301 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6302 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6303 RTX_FRAME_RELATED_P (insn) = 1;
6304 }
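/* For instance, pushing x29 and x30 with an adjustment of 96 emits
   (roughly) "stp x29, x30, [sp, #-96]!", leaving the rest of the frame
   to be addressed at positive offsets from the new stack pointer.  */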
6305
6306 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
6307 adjusting it by ADJUSTMENT afterwards. */
6308
6309 static rtx
6310 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6311 HOST_WIDE_INT adjustment)
6312 {
6313 switch (mode)
6314 {
6315 case E_DImode:
6316 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6317 GEN_INT (UNITS_PER_WORD));
6318 case E_DFmode:
6319 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6320 GEN_INT (UNITS_PER_WORD));
6321 case E_TFmode:
6322 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6323 GEN_INT (UNITS_PER_VREG));
6324 default:
6325 gcc_unreachable ();
6326 }
6327 }
6328
6329 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6330 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6331 into CFI_OPS. */
6332
6333 static void
6334 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6335 rtx *cfi_ops)
6336 {
6337 machine_mode mode = aarch64_reg_save_mode (regno1);
6338 rtx reg1 = gen_rtx_REG (mode, regno1);
6339
6340 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6341
6342 if (regno2 == INVALID_REGNUM)
6343 {
6344 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6345 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6346 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6347 }
6348 else
6349 {
6350 rtx reg2 = gen_rtx_REG (mode, regno2);
6351 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6352 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6353 reg2, adjustment));
6354 }
6355 }
6356
6357 /* Generate and return a store pair instruction of mode MODE to store
6358 register REG1 to MEM1 and register REG2 to MEM2. */
6359
6360 static rtx
6361 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6362 rtx reg2)
6363 {
6364 switch (mode)
6365 {
6366 case E_DImode:
6367 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6368
6369 case E_DFmode:
6370 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6371
6372 case E_TFmode:
6373 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6374
6375 default:
6376 gcc_unreachable ();
6377 }
6378 }
6379
6380 /* Generate and return a load pair instruction of mode MODE to load register
6381 REG1 from MEM1 and register REG2 from MEM2. */
6382
6383 static rtx
6384 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6385 rtx mem2)
6386 {
6387 switch (mode)
6388 {
6389 case E_DImode:
6390 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6391
6392 case E_DFmode:
6393 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6394
6395 case E_TFmode:
6396 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6397
6398 default:
6399 gcc_unreachable ();
6400 }
6401 }
6402
6403 /* Return TRUE if return address signing should be enabled for the current
6404 function, otherwise return FALSE. */
6405
6406 bool
6407 aarch64_return_address_signing_enabled (void)
6408 {
6409 /* This function should only be called after the frame is laid out. */
6410 gcc_assert (cfun->machine->frame.laid_out);
6411
6412 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we sign a leaf function
6413 only if its LR is pushed onto the stack. */
6414 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6415 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6416 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6417 }
6418
6419 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6420 bool
6421 aarch64_bti_enabled (void)
6422 {
6423 return (aarch64_enable_bti == 1);
6424 }
6425
6426 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6427 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6428 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6429
6430 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6431 or LD1D address
6432
6433 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6434 if the variable isn't already nonnull
6435
6436 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6437 Handle this case using a temporary base register that is suitable for
6438 all offsets in that range. Use ANCHOR_REG as this base register if it
6439 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6440
6441 static inline void
6442 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6443 rtx &anchor_reg, poly_int64 &offset,
6444 rtx &ptrue)
6445 {
6446 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6447 {
6448 /* This is the maximum valid offset of the anchor from the base.
6449 Lower values would be valid too. */
6450 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6451 if (!anchor_reg)
6452 {
6453 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6454 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6455 gen_int_mode (anchor_offset, Pmode)));
6456 }
6457 base_rtx = anchor_reg;
6458 offset -= anchor_offset;
6459 }
6460 if (!ptrue)
6461 {
6462 int pred_reg = cfun->machine->frame.spare_pred_reg;
6463 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6464 CONSTM1_RTX (VNx16BImode));
6465 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6466 }
6467 }
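/* For example (a sketch): saving an SVE vector at offset 9 * VL first
   sets ANCHOR_REG = BASE + 16 * VL, after which the save itself can use
   a single ST1D with an immediate offset of -7, MUL VL, which is within
   the [-8, 7] range that ST1D/LD1D addressing accepts.  */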
6468
6469 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6470 is saved at BASE + OFFSET. */
6471
6472 static void
6473 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6474 rtx base, poly_int64 offset)
6475 {
6476 rtx mem = gen_frame_mem (GET_MODE (reg),
6477 plus_constant (Pmode, base, offset));
6478 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6479 }
6480
6481 /* Emit code to save the callee-saved registers from register number START
6482 to LIMIT to the stack at the location starting at offset START_OFFSET,
6483 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6484 is true if the hard frame pointer has been set up. */
6485
6486 static void
6487 aarch64_save_callee_saves (poly_int64 start_offset,
6488 unsigned start, unsigned limit, bool skip_wb,
6489 bool hard_fp_valid_p)
6490 {
6491 rtx_insn *insn;
6492 unsigned regno;
6493 unsigned regno2;
6494 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6495
6496 for (regno = aarch64_next_callee_save (start, limit);
6497 regno <= limit;
6498 regno = aarch64_next_callee_save (regno + 1, limit))
6499 {
6500 rtx reg, mem;
6501 poly_int64 offset;
6502 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6503
6504 if (skip_wb
6505 && (regno == cfun->machine->frame.wb_candidate1
6506 || regno == cfun->machine->frame.wb_candidate2))
6507 continue;
6508
6509 if (cfun->machine->reg_is_wrapped_separately[regno])
6510 continue;
6511
6512 machine_mode mode = aarch64_reg_save_mode (regno);
6513 reg = gen_rtx_REG (mode, regno);
6514 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6515 rtx base_rtx = stack_pointer_rtx;
6516 poly_int64 sp_offset = offset;
6517
6518 HOST_WIDE_INT const_offset;
6519 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6520 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6521 offset, ptrue);
6522 else if (GP_REGNUM_P (regno)
6523 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6524 {
6525 gcc_assert (known_eq (start_offset, 0));
6526 poly_int64 fp_offset
6527 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6528 if (hard_fp_valid_p)
6529 base_rtx = hard_frame_pointer_rtx;
6530 else
6531 {
6532 if (!anchor_reg)
6533 {
6534 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6535 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6536 gen_int_mode (fp_offset, Pmode)));
6537 }
6538 base_rtx = anchor_reg;
6539 }
6540 offset -= fp_offset;
6541 }
6542 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6543 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6544
6545 if (!aarch64_sve_mode_p (mode)
6546 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6547 && !cfun->machine->reg_is_wrapped_separately[regno2]
6548 && known_eq (GET_MODE_SIZE (mode),
6549 cfun->machine->frame.reg_offset[regno2]
6550 - cfun->machine->frame.reg_offset[regno]))
6551 {
6552 rtx reg2 = gen_rtx_REG (mode, regno2);
6553 rtx mem2;
6554
6555 offset += GET_MODE_SIZE (mode);
6556 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6557 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6558 reg2));
6559
6560 /* The first part of a frame-related parallel insn is
6561 always assumed to be relevant to the frame
6562 calculations; subsequent parts are only
6563 frame-related if explicitly marked. */
6564 if (aarch64_emit_cfi_for_reg_p (regno2))
6565 {
6566 if (need_cfa_note_p)
6567 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6568 sp_offset + GET_MODE_SIZE (mode));
6569 else
6570 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6571 }
6572
6573 regno = regno2;
6574 }
6575 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6576 {
6577 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6578 need_cfa_note_p = true;
6579 }
6580 else if (aarch64_sve_mode_p (mode))
6581 insn = emit_insn (gen_rtx_SET (mem, reg));
6582 else
6583 insn = emit_move_insn (mem, reg);
6584
6585 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6586 if (frame_related_p && need_cfa_note_p)
6587 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6588 }
6589 }
6590
6591 /* Emit code to restore the callee registers from register number START
6592 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6593 skipping any write-back candidates if SKIP_WB is true. Write the
6594 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6595
6596 static void
6597 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6598 unsigned limit, bool skip_wb, rtx *cfi_ops)
6599 {
6600 unsigned regno;
6601 unsigned regno2;
6602 poly_int64 offset;
6603 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6604
6605 for (regno = aarch64_next_callee_save (start, limit);
6606 regno <= limit;
6607 regno = aarch64_next_callee_save (regno + 1, limit))
6608 {
6609 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6610 if (cfun->machine->reg_is_wrapped_separately[regno])
6611 continue;
6612
6613 rtx reg, mem;
6614
6615 if (skip_wb
6616 && (regno == cfun->machine->frame.wb_candidate1
6617 || regno == cfun->machine->frame.wb_candidate2))
6618 continue;
6619
6620 machine_mode mode = aarch64_reg_save_mode (regno);
6621 reg = gen_rtx_REG (mode, regno);
6622 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6623 rtx base_rtx = stack_pointer_rtx;
6624 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6625 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6626 offset, ptrue);
6627 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6628
6629 if (!aarch64_sve_mode_p (mode)
6630 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6631 && !cfun->machine->reg_is_wrapped_separately[regno2]
6632 && known_eq (GET_MODE_SIZE (mode),
6633 cfun->machine->frame.reg_offset[regno2]
6634 - cfun->machine->frame.reg_offset[regno]))
6635 {
6636 rtx reg2 = gen_rtx_REG (mode, regno2);
6637 rtx mem2;
6638
6639 offset += GET_MODE_SIZE (mode);
6640 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6641 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6642
6643 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6644 regno = regno2;
6645 }
6646 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6647 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6648 else if (aarch64_sve_mode_p (mode))
6649 emit_insn (gen_rtx_SET (reg, mem));
6650 else
6651 emit_move_insn (reg, mem);
6652 if (frame_related_p)
6653 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6654 }
6655 }
6656
6657 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6658 of MODE. */
6659
6660 static inline bool
6661 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6662 {
6663 HOST_WIDE_INT multiple;
6664 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6665 && IN_RANGE (multiple, -8, 7));
6666 }
6667
6668 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6669 of MODE. */
6670
6671 static inline bool
6672 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6673 {
6674 HOST_WIDE_INT multiple;
6675 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6676 && IN_RANGE (multiple, 0, 63));
6677 }
6678
6679 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6680 of MODE. */
6681
6682 bool
6683 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6684 {
6685 HOST_WIDE_INT multiple;
6686 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6687 && IN_RANGE (multiple, -64, 63));
6688 }
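/* For DImode, for instance, this accepts byte offsets in [-512, 504] in
   steps of 8, matching the immediate range of LDP/STP of X registers;
   TFmode similarly gives [-1024, 1008] in steps of 16.  */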
6689
6690 /* Return true if OFFSET is a signed 9-bit value. */
6691
6692 bool
6693 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6694 poly_int64 offset)
6695 {
6696 HOST_WIDE_INT const_offset;
6697 return (offset.is_constant (&const_offset)
6698 && IN_RANGE (const_offset, -256, 255));
6699 }
6700
6701 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6702 of MODE. */
6703
6704 static inline bool
6705 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6706 {
6707 HOST_WIDE_INT multiple;
6708 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6709 && IN_RANGE (multiple, -256, 255));
6710 }
6711
6712 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6713 of MODE. */
6714
6715 static inline bool
6716 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6717 {
6718 HOST_WIDE_INT multiple;
6719 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6720 && IN_RANGE (multiple, 0, 4095));
6721 }
6722
6723 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6724
6725 static sbitmap
6726 aarch64_get_separate_components (void)
6727 {
6728 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6729 bitmap_clear (components);
6730
6731 /* The registers we need saved to the frame. */
6732 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6733 if (aarch64_register_saved_on_entry (regno))
6734 {
6735 /* Punt on saves and restores that use ST1D and LD1D. We could
6736 try to be smarter, but it would involve making sure that the
6737 spare predicate register itself is safe to use at the save
6738 and restore points. Also, when a frame pointer is being used,
6739 the slots are often out of reach of ST1D and LD1D anyway. */
6740 machine_mode mode = aarch64_reg_save_mode (regno);
6741 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6742 continue;
6743
6744 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6745
6746 /* If the register is saved in the first SVE save slot, we use
6747 it as a stack probe for -fstack-clash-protection. */
6748 if (flag_stack_clash_protection
6749 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6750 && known_eq (offset, 0))
6751 continue;
6752
6753 /* Get the offset relative to the register we'll use. */
6754 if (frame_pointer_needed)
6755 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6756 else
6757 offset += crtl->outgoing_args_size;
6758
6759 /* Check that we can access the stack slot of the register with one
6760 direct load with no adjustments needed. */
6761 if (aarch64_sve_mode_p (mode)
6762 ? offset_9bit_signed_scaled_p (mode, offset)
6763 : offset_12bit_unsigned_scaled_p (mode, offset))
6764 bitmap_set_bit (components, regno);
6765 }
6766
6767 /* Don't mess with the hard frame pointer. */
6768 if (frame_pointer_needed)
6769 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6770
6771 /* If the spare predicate register used by big-endian SVE code
6772 is call-preserved, it must be saved in the main prologue
6773 before any saves that use it. */
6774 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6775 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6776
6777 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6778 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6779 /* If registers have been chosen to be stored/restored with
6780 writeback don't interfere with them to avoid having to output explicit
6781 stack adjustment instructions. */
6782 if (reg2 != INVALID_REGNUM)
6783 bitmap_clear_bit (components, reg2);
6784 if (reg1 != INVALID_REGNUM)
6785 bitmap_clear_bit (components, reg1);
6786
6787 bitmap_clear_bit (components, LR_REGNUM);
6788 bitmap_clear_bit (components, SP_REGNUM);
6789
6790 return components;
6791 }
6792
6793 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6794
6795 static sbitmap
6796 aarch64_components_for_bb (basic_block bb)
6797 {
6798 bitmap in = DF_LIVE_IN (bb);
6799 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6800 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6801
6802 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6803 bitmap_clear (components);
6804
6805 /* Clobbered registers don't generate values in any meaningful sense,
6806 since nothing after the clobber can rely on their value. And we can't
6807 say that partially-clobbered registers are unconditionally killed,
6808 because whether they're killed or not depends on the mode of the
6809 value they're holding. Thus partially call-clobbered registers
6810 appear in neither the kill set nor the gen set.
6811
6812 Check manually for any calls that clobber more of a register than the
6813 current function can. */
6814 function_abi_aggregator callee_abis;
6815 rtx_insn *insn;
6816 FOR_BB_INSNS (bb, insn)
6817 if (CALL_P (insn))
6818 callee_abis.note_callee_abi (insn_callee_abi (insn));
6819 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6820
6821 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6822 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6823 if (!fixed_regs[regno]
6824 && !crtl->abi->clobbers_full_reg_p (regno)
6825 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6826 || bitmap_bit_p (in, regno)
6827 || bitmap_bit_p (gen, regno)
6828 || bitmap_bit_p (kill, regno)))
6829 {
6830 bitmap_set_bit (components, regno);
6831
6832 /* If there is a callee-save at an adjacent offset, add it too
6833 to increase the use of LDP/STP. */
6834 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6835 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6836
6837 if (regno2 <= LAST_SAVED_REGNUM)
6838 {
6839 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6840 if (regno < regno2
6841 ? known_eq (offset + 8, offset2)
6842 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6843 bitmap_set_bit (components, regno2);
6844 }
6845 }
6846
6847 return components;
6848 }
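/* A worked example of the pairing logic above (register choice and offsets
   are illustrative): if x22 is live in a block and is saved at offset 16,
   and x23 is saved at the adjacent offset 24, then selecting x22 for
   separate wrapping also selects x23, so that the two saves can later be
   emitted as a single STP/LDP. */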
6849
6850 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6851 Nothing to do for aarch64. */
6852
6853 static void
6854 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6855 {
6856 }
6857
6858 /* Return the next set bit in BMP from START onwards. Return the total number
6859 of bits in BMP if no set bit is found at or after START. */
6860
6861 static unsigned int
6862 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6863 {
6864 unsigned int nbits = SBITMAP_SIZE (bmp);
6865 if (start == nbits)
6866 return start;
6867
6868 gcc_assert (start < nbits);
6869 for (unsigned int i = start; i < nbits; i++)
6870 if (bitmap_bit_p (bmp, i))
6871 return i;
6872
6873 return nbits;
6874 }
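/* For example, if BMP holds 40 bits and only bits 2 and 5 are set, then
   aarch64_get_next_set_bit (bmp, 3) returns 5, while
   aarch64_get_next_set_bit (bmp, 6) returns 40 (the size of BMP). */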
6875
6876 /* Do the work for aarch64_emit_prologue_components and
6877 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6878 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
6879 for these components or the epilogue sequence. That is, it determines
6880 whether we should emit stores or loads and what kind of CFA notes to attach
6881 to the insns. Otherwise the logic for the two sequences is very
6882 similar. */
6883
6884 static void
6885 aarch64_process_components (sbitmap components, bool prologue_p)
6886 {
6887 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6888 ? HARD_FRAME_POINTER_REGNUM
6889 : STACK_POINTER_REGNUM);
6890
6891 unsigned last_regno = SBITMAP_SIZE (components);
6892 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6893 rtx_insn *insn = NULL;
6894
6895 while (regno != last_regno)
6896 {
6897 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6898 machine_mode mode = aarch64_reg_save_mode (regno);
6899
6900 rtx reg = gen_rtx_REG (mode, regno);
6901 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6902 if (frame_pointer_needed)
6903 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6904 else
6905 offset += crtl->outgoing_args_size;
6906
6907 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6908 rtx mem = gen_frame_mem (mode, addr);
6909
6910 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6911 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6912 /* No more registers to handle after REGNO.
6913 Emit a single save/restore and exit. */
6914 if (regno2 == last_regno)
6915 {
6916 insn = emit_insn (set);
6917 if (frame_related_p)
6918 {
6919 RTX_FRAME_RELATED_P (insn) = 1;
6920 if (prologue_p)
6921 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6922 else
6923 add_reg_note (insn, REG_CFA_RESTORE, reg);
6924 }
6925 break;
6926 }
6927
6928 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6929 /* The next register is not of the same class or its offset is not
6930 mergeable with the current one into a pair. */
6931 if (aarch64_sve_mode_p (mode)
6932 || !satisfies_constraint_Ump (mem)
6933 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6934 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6935 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6936 GET_MODE_SIZE (mode)))
6937 {
6938 insn = emit_insn (set);
6939 if (frame_related_p)
6940 {
6941 RTX_FRAME_RELATED_P (insn) = 1;
6942 if (prologue_p)
6943 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6944 else
6945 add_reg_note (insn, REG_CFA_RESTORE, reg);
6946 }
6947
6948 regno = regno2;
6949 continue;
6950 }
6951
6952 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6953
6954 /* REGNO2 can be saved/restored in a pair with REGNO. */
6955 rtx reg2 = gen_rtx_REG (mode, regno2);
6956 if (frame_pointer_needed)
6957 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6958 else
6959 offset2 += crtl->outgoing_args_size;
6960 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6961 rtx mem2 = gen_frame_mem (mode, addr2);
6962 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6963 : gen_rtx_SET (reg2, mem2);
6964
6965 if (prologue_p)
6966 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6967 else
6968 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6969
6970 if (frame_related_p || frame_related2_p)
6971 {
6972 RTX_FRAME_RELATED_P (insn) = 1;
6973 if (prologue_p)
6974 {
6975 if (frame_related_p)
6976 add_reg_note (insn, REG_CFA_OFFSET, set);
6977 if (frame_related2_p)
6978 add_reg_note (insn, REG_CFA_OFFSET, set2);
6979 }
6980 else
6981 {
6982 if (frame_related_p)
6983 add_reg_note (insn, REG_CFA_RESTORE, reg);
6984 if (frame_related2_p)
6985 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6986 }
6987 }
6988
6989 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6990 }
6991 }
6992
6993 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6994
6995 static void
6996 aarch64_emit_prologue_components (sbitmap components)
6997 {
6998 aarch64_process_components (components, true);
6999 }
7000
7001 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7002
7003 static void
7004 aarch64_emit_epilogue_components (sbitmap components)
7005 {
7006 aarch64_process_components (components, false);
7007 }
7008
7009 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7010
7011 static void
7012 aarch64_set_handled_components (sbitmap components)
7013 {
7014 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7015 if (bitmap_bit_p (components, regno))
7016 cfun->machine->reg_is_wrapped_separately[regno] = true;
7017 }
7018
7019 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7020 determine the probe offset for alloca. */
7021
7022 static HOST_WIDE_INT
7023 aarch64_stack_clash_protection_alloca_probe_range (void)
7024 {
7025 return STACK_CLASH_CALLER_GUARD;
7026 }
7027
7028
7029 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7030 registers. If POLY_SIZE is not large enough to require a probe, this function
7031 will only adjust the stack. When allocating the stack space,
7032 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
7033 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7034 arguments. If we are, then we ensure that any allocation larger than the ABI
7035 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7036 maintained.
7037
7038 We emit barriers after each stack adjustment to prevent optimizations from
7039 breaking the invariant that we never drop the stack more than a page. This
7040 invariant is needed to make it easier to correctly handle asynchronous
7041 events: e.g. if we were to allow the stack to be dropped by more than a page
7042 and then probe it in multiple steps, and a signal were taken somewhere in
7043 between, the signal handler would not know the state of the stack and could
7044 make no assumptions about which pages have been probed. */
7045
7046 static void
7047 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7048 poly_int64 poly_size,
7049 bool frame_related_p,
7050 bool final_adjustment_p)
7051 {
7052 HOST_WIDE_INT guard_size
7053 = 1 << param_stack_clash_protection_guard_size;
7054 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7055 HOST_WIDE_INT min_probe_threshold
7056 = (final_adjustment_p
7057 ? guard_used_by_caller
7058 : guard_size - guard_used_by_caller);
7059 /* When doing the final adjustment for the outgoing arguments, take into
7060 account any unprobed space there is above the current SP. There are
7061 two cases:
7062
7063 - When saving SVE registers below the hard frame pointer, we force
7064 the lowest save to take place in the prologue before doing the final
7065 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7066 This acts as a probe at SP, so there is no unprobed space.
7067
7068 - When there are no SVE register saves, we use the store of the link
7069 register as a probe. We can't assume that LR was saved at position 0
7070 though, so treat any space below it as unprobed. */
7071 if (final_adjustment_p
7072 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7073 {
7074 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7075 if (known_ge (lr_offset, 0))
7076 min_probe_threshold -= lr_offset.to_constant ();
7077 else
7078 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7079 }
7080
7081 poly_int64 frame_size = cfun->machine->frame.frame_size;
7082
7083 /* We should always have a positive probe threshold. */
7084 gcc_assert (min_probe_threshold > 0);
7085
7086 if (flag_stack_clash_protection && !final_adjustment_p)
7087 {
7088 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7089 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7090 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7091
7092 if (known_eq (frame_size, 0))
7093 {
7094 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7095 }
7096 else if (known_lt (initial_adjust + sve_callee_adjust,
7097 guard_size - guard_used_by_caller)
7098 && known_lt (final_adjust, guard_used_by_caller))
7099 {
7100 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7101 }
7102 }
7103
7104 /* If SIZE is not large enough to require probing, just adjust the stack and
7105 exit. */
7106 if (known_lt (poly_size, min_probe_threshold)
7107 || !flag_stack_clash_protection)
7108 {
7109 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7110 return;
7111 }
7112
7113 HOST_WIDE_INT size;
7114 /* Handle the SVE non-constant case first. */
7115 if (!poly_size.is_constant (&size))
7116 {
7117 if (dump_file)
7118 {
7119 fprintf (dump_file, "Stack clash SVE prologue: ");
7120 print_dec (poly_size, dump_file);
7121 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7122 }
7123
7124 /* First calculate the number of bytes we're actually spilling. */
7125 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7126 poly_size, temp1, temp2, false, true);
7127
7128 rtx_insn *insn = get_last_insn ();
7129
7130 if (frame_related_p)
7131 {
7132 /* This is done to provide unwinding information for the stack
7133 adjustments we're about to do. However, to prevent the optimizers
7134 from removing the R11 move and leaving the CFA note (which would be
7135 very wrong), we tie the old and new stack pointer together.
7136 The tie will expand to nothing, but the optimizers will not touch
7137 the instruction. */
7138 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7139 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7140 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7141
7142 /* We want the CFA independent of the stack pointer for the
7143 duration of the loop. */
7144 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7145 RTX_FRAME_RELATED_P (insn) = 1;
7146 }
7147
7148 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7149 rtx guard_const = gen_int_mode (guard_size, Pmode);
7150
7151 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7152 stack_pointer_rtx, temp1,
7153 probe_const, guard_const));
7154
7155 /* Now reset the CFA register if needed. */
7156 if (frame_related_p)
7157 {
7158 add_reg_note (insn, REG_CFA_DEF_CFA,
7159 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7160 gen_int_mode (poly_size, Pmode)));
7161 RTX_FRAME_RELATED_P (insn) = 1;
7162 }
7163
7164 return;
7165 }
7166
7167 if (dump_file)
7168 fprintf (dump_file,
7169 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7170 " bytes, probing will be required.\n", size);
7171
7172 /* Round size down to a multiple of guard_size, and calculate the
7173 residual as the difference between the original size and the rounded
7174 size. */
7175 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7176 HOST_WIDE_INT residual = size - rounded_size;
7177
7178 /* We can handle a small number of allocations/probes inline. Otherwise
7179 punt to a loop. */
7180 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7181 {
7182 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7183 {
7184 aarch64_sub_sp (NULL, temp2, guard_size, true);
7185 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7186 guard_used_by_caller));
7187 emit_insn (gen_blockage ());
7188 }
7189 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7190 }
7191 else
7192 {
7193 /* Compute the ending address. */
7194 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7195 temp1, NULL, false, true);
7196 rtx_insn *insn = get_last_insn ();
7197
7198 /* For the initial allocation, we don't have a frame pointer
7199 set up, so we always need CFI notes. If we're doing the
7200 final allocation, then we may have a frame pointer, in which
7201 case it is the CFA, otherwise we need CFI notes.
7202
7203 We can determine which allocation we are doing by looking at
7204 the value of FRAME_RELATED_P since the final allocations are not
7205 frame related. */
7206 if (frame_related_p)
7207 {
7208 /* We want the CFA independent of the stack pointer for the
7209 duration of the loop. */
7210 add_reg_note (insn, REG_CFA_DEF_CFA,
7211 plus_constant (Pmode, temp1, rounded_size));
7212 RTX_FRAME_RELATED_P (insn) = 1;
7213 }
7214
7215 /* This allocates and probes the stack. Note that this re-uses some of
7216 the existing Ada stack protection code. However, we are guaranteed not
7217 to enter the non-loop or residual branches of that code.
7218
7219 The non-loop part won't be entered because if our allocation amount
7220 doesn't require a loop, the case above would handle it.
7221
7222 The residual branch won't be entered because TEMP1 is a multiple of
7223 the allocation size. The residual will always be 0. As such, the only
7224 part we are actually using from that code is the loop setup. The
7225 actual probing is done in aarch64_output_probe_stack_range. */
7226 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7227 stack_pointer_rtx, temp1));
7228
7229 /* Now reset the CFA register if needed. */
7230 if (frame_related_p)
7231 {
7232 add_reg_note (insn, REG_CFA_DEF_CFA,
7233 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7234 RTX_FRAME_RELATED_P (insn) = 1;
7235 }
7236
7237 emit_insn (gen_blockage ());
7238 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7239 }
7240
7241 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7242 be probed. This maintains the requirement that each page is probed at
7243 least once. For initial probing we probe only if the allocation is
7244 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7245 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7246 GUARD_SIZE. This means that for any allocation that is large enough to
7247 trigger a probe here, we'll have at least one, and for any allocation that
7248 is not large enough for this code to emit anything, the page will have been
7249 probed by the saving of FP/LR, either by this function or by any callees. If
7250 we don't have any callees then we won't have more stack adjustments and so
7251 are still safe. */
7252 if (residual)
7253 {
7254 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7255 /* If we're doing final adjustments, and we've done any full page
7256 allocations then any residual needs to be probed. */
7257 if (final_adjustment_p && rounded_size != 0)
7258 min_probe_threshold = 0;
7259 /* If doing a small final adjustment, we always probe at offset 0.
7260 This is done to avoid issues when LR is not at position 0 or when
7261 the final adjustment is smaller than the probing offset. */
7262 else if (final_adjustment_p && rounded_size == 0)
7263 residual_probe_offset = 0;
7264
7265 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7266 if (residual >= min_probe_threshold)
7267 {
7268 if (dump_file)
7269 fprintf (dump_file,
7270 "Stack clash AArch64 prologue residuals: "
7271 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7272 "\n", residual);
7273
7274 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7275 residual_probe_offset));
7276 emit_insn (gen_blockage ());
7277 }
7278 }
7279 }
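/* A rough illustration of the arithmetic above, assuming the default 64KiB
   guard and the 1KiB STACK_CLASH_CALLER_GUARD: an initial allocation of
   200000 bytes gives rounded_size = 196608, which is allocated and probed in
   guard-sized steps (inline or via the loop, depending on
   STACK_CLASH_MAX_UNROLL_PAGES), and residual = 3392, which is below the
   probe threshold and therefore relies on the subsequent FP/LR saves acting
   as probes. */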
7280
7281 /* Return 1 if the register is used by the epilogue. We need to say the
7282 return register is used, but only after epilogue generation is complete.
7283 Note that in the case of sibcalls, the values "used by the epilogue" are
7284 considered live at the start of the called function.
7285
7286 For SIMD functions we need to return 1 for FP registers that are saved and
7287 restored by a function but are not zero in call_used_regs. If we do not do
7288 this, optimizations may remove the restore of the register. */
7289
7290 int
7291 aarch64_epilogue_uses (int regno)
7292 {
7293 if (epilogue_completed)
7294 {
7295 if (regno == LR_REGNUM)
7296 return 1;
7297 }
7298 return 0;
7299 }
7300
7301 /* AArch64 stack frames generated by this compiler look like:
7302
7303 +-------------------------------+
7304 | |
7305 | incoming stack arguments |
7306 | |
7307 +-------------------------------+
7308 | | <-- incoming stack pointer (aligned)
7309 | callee-allocated save area |
7310 | for register varargs |
7311 | |
7312 +-------------------------------+
7313 | local variables | <-- frame_pointer_rtx
7314 | |
7315 +-------------------------------+
7316 | padding | \
7317 +-------------------------------+ |
7318 | callee-saved registers | | frame.saved_regs_size
7319 +-------------------------------+ |
7320 | LR' | |
7321 +-------------------------------+ |
7322 | FP' | |
7323 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7324 | SVE vector registers | | \
7325 +-------------------------------+ | | below_hard_fp_saved_regs_size
7326 | SVE predicate registers | / /
7327 +-------------------------------+
7328 | dynamic allocation |
7329 +-------------------------------+
7330 | padding |
7331 +-------------------------------+
7332 | outgoing stack arguments | <-- arg_pointer
7333 | |
7334 +-------------------------------+
7335 | | <-- stack_pointer_rtx (aligned)
7336
7337 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7338 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7339 unchanged.
7340
7341 By default for stack-clash we assume the guard is at least 64KB, but this
7342 value is configurable to either 4KB or 64KB. We also force the guard size to
7343 be the same as the probing interval and both values are kept in sync.
7344
7345 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7346 on the guard size) of stack space without probing.
7347
7348 When probing is needed, we emit a probe at the start of the prologue
7349 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7350
7351 We have to track how much space has been allocated and the only stores
7352 to the stack we track as implicit probes are the FP/LR stores.
7353
7354 For outgoing arguments we probe if the size is larger than 1KB, such that
7355 the ABI specified buffer is maintained for the next callee.
7356
7357 The following registers are reserved during frame layout and should not be
7358 used for any other purpose:
7359
7360 - r11: Used by stack clash protection when SVE is enabled, and also
7361 as an anchor register when saving and restoring registers
7362 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7363 - r14 and r15: Used for speculation tracking.
7364 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7365 - r30(LR), r29(FP): Used by standard frame layout.
7366
7367 These registers must be avoided in frame layout related code unless the
7368 explicit intention is to interact with one of the features listed above. */
7369
7370 /* Generate the prologue instructions for entry into a function.
7371 Establish the stack frame by decreasing the stack pointer with a
7372 properly calculated size and, if necessary, create a frame record
7373 filled with the values of LR and previous frame pointer. The
7374 current FP is also set up if it is in use. */
7375
7376 void
7377 aarch64_expand_prologue (void)
7378 {
7379 poly_int64 frame_size = cfun->machine->frame.frame_size;
7380 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7381 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7382 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7383 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7384 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7385 poly_int64 below_hard_fp_saved_regs_size
7386 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7387 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7388 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7389 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7390 rtx_insn *insn;
7391
7392 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7393 {
7394 /* Fold the SVE allocation into the initial allocation.
7395 We don't do this in aarch64_layout_frame to avoid pessimizing
7396 the epilogue code. */
7397 initial_adjust += sve_callee_adjust;
7398 sve_callee_adjust = 0;
7399 }
7400
7401 /* Sign return address for functions. */
7402 if (aarch64_return_address_signing_enabled ())
7403 {
7404 switch (aarch64_ra_sign_key)
7405 {
7406 case AARCH64_KEY_A:
7407 insn = emit_insn (gen_paciasp ());
7408 break;
7409 case AARCH64_KEY_B:
7410 insn = emit_insn (gen_pacibsp ());
7411 break;
7412 default:
7413 gcc_unreachable ();
7414 }
7415 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7416 RTX_FRAME_RELATED_P (insn) = 1;
7417 }
7418
7419 if (flag_stack_usage_info)
7420 current_function_static_stack_size = constant_lower_bound (frame_size);
7421
7422 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7423 {
7424 if (crtl->is_leaf && !cfun->calls_alloca)
7425 {
7426 if (maybe_gt (frame_size, PROBE_INTERVAL)
7427 && maybe_gt (frame_size, get_stack_check_protect ()))
7428 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7429 (frame_size
7430 - get_stack_check_protect ()));
7431 }
7432 else if (maybe_gt (frame_size, 0))
7433 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7434 }
7435
7436 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7437 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7438
7439 /* In theory we should never have both an initial adjustment
7440 and a callee save adjustment. Verify that is the case since the
7441 code below does not handle it for -fstack-clash-protection. */
7442 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7443
7444 /* Will only probe if the initial adjustment is larger than the guard
7445 less the amount of the guard reserved for use by the caller's
7446 outgoing args. */
7447 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7448 true, false);
7449
7450 if (callee_adjust != 0)
7451 aarch64_push_regs (reg1, reg2, callee_adjust);
7452
7453 /* The offset of the frame chain record (if any) from the current SP. */
7454 poly_int64 chain_offset = (initial_adjust + callee_adjust
7455 - cfun->machine->frame.hard_fp_offset);
7456 gcc_assert (known_ge (chain_offset, 0));
7457
7458 /* The offset of the bottom of the save area from the current SP. */
7459 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7460
7461 if (emit_frame_chain)
7462 {
7463 if (callee_adjust == 0)
7464 {
7465 reg1 = R29_REGNUM;
7466 reg2 = R30_REGNUM;
7467 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7468 false, false);
7469 }
7470 else
7471 gcc_assert (known_eq (chain_offset, 0));
7472 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7473 stack_pointer_rtx, chain_offset,
7474 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7475 if (frame_pointer_needed && !frame_size.is_constant ())
7476 {
7477 /* Variable-sized frames need to describe the save slot
7478 address using DW_CFA_expression rather than DW_CFA_offset.
7479 This means that, without taking further action, the
7480 locations of the registers that we've already saved would
7481 remain based on the stack pointer even after we redefine
7482 the CFA based on the frame pointer. We therefore need new
7483 DW_CFA_expressions to re-express the save slots with addresses
7484 based on the frame pointer. */
7485 rtx_insn *insn = get_last_insn ();
7486 gcc_assert (RTX_FRAME_RELATED_P (insn));
7487
7488 /* Add an explicit CFA definition if this was previously
7489 implicit. */
7490 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7491 {
7492 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7493 callee_offset);
7494 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7495 gen_rtx_SET (hard_frame_pointer_rtx, src));
7496 }
7497
7498 /* Change the save slot expressions for the registers that
7499 we've already saved. */
7500 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7501 hard_frame_pointer_rtx, UNITS_PER_WORD);
7502 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7503 hard_frame_pointer_rtx, 0);
7504 }
7505 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7506 }
7507
7508 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7509 callee_adjust != 0 || emit_frame_chain,
7510 emit_frame_chain);
7511 if (maybe_ne (sve_callee_adjust, 0))
7512 {
7513 gcc_assert (!flag_stack_clash_protection
7514 || known_eq (initial_adjust, 0));
7515 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7516 sve_callee_adjust,
7517 !frame_pointer_needed, false);
7518 saved_regs_offset += sve_callee_adjust;
7519 }
7520 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7521 false, emit_frame_chain);
7522 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7523 callee_adjust != 0 || emit_frame_chain,
7524 emit_frame_chain);
7525
7526 /* We may need to probe the final adjustment if it is larger than the guard
7527 that is assumed by the callee. */
7528 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7529 !frame_pointer_needed, true);
7530 }
7531
7532 /* Return TRUE if we can use a simple_return insn.
7533
7534 This function checks whether the callee saved stack is empty, which
7535 means no restore actions are needed. The pro_and_epilogue pass will use
7536 this to check whether the shrink-wrapping optimization is feasible. */
7537
7538 bool
7539 aarch64_use_return_insn_p (void)
7540 {
7541 if (!reload_completed)
7542 return false;
7543
7544 if (crtl->profile)
7545 return false;
7546
7547 return known_eq (cfun->machine->frame.frame_size, 0);
7548 }
7549
7550 /* Generate the epilogue instructions for returning from a function.
7551 This is almost exactly the reverse of the prologue sequence, except
7552 that we need to insert barriers to avoid scheduling loads that read
7553 from a deallocated stack, and we optimize the unwind records by
7554 emitting them all together if possible. */
7555 void
7556 aarch64_expand_epilogue (bool for_sibcall)
7557 {
7558 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7559 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7560 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7561 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7562 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7563 poly_int64 below_hard_fp_saved_regs_size
7564 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7565 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7566 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7567 rtx cfi_ops = NULL;
7568 rtx_insn *insn;
7569 /* A stack clash protection prologue may not have left EP0_REGNUM or
7570 EP1_REGNUM in a usable state. The same is true for allocations
7571 with an SVE component, since we then need both temporary registers
7572 for each allocation. For stack clash we are in a usable state if
7573 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7574 HOST_WIDE_INT guard_size
7575 = 1 << param_stack_clash_protection_guard_size;
7576 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7577
7578 /* We can re-use the registers when:
7579
7580 (a) the deallocation amount is the same as the corresponding
7581 allocation amount (which is false if we combine the initial
7582 and SVE callee save allocations in the prologue); and
7583
7584 (b) the allocation amount doesn't need a probe (which is false
7585 if the amount is guard_size - guard_used_by_caller or greater).
7586
7587 In such situations the register should remain live with the correct
7588 value. */
7589 bool can_inherit_p = (initial_adjust.is_constant ()
7590 && final_adjust.is_constant ()
7591 && (!flag_stack_clash_protection
7592 || (known_lt (initial_adjust,
7593 guard_size - guard_used_by_caller)
7594 && known_eq (sve_callee_adjust, 0))));
7595
7596 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
7597 bool need_barrier_p
7598 = maybe_ne (get_frame_size ()
7599 + cfun->machine->frame.saved_varargs_size, 0);
7600
7601 /* Emit a barrier to prevent loads from a deallocated stack. */
7602 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7603 || cfun->calls_alloca
7604 || crtl->calls_eh_return)
7605 {
7606 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7607 need_barrier_p = false;
7608 }
7609
7610 /* Restore the stack pointer from the frame pointer if it may not
7611 be the same as the stack pointer. */
7612 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7613 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7614 if (frame_pointer_needed
7615 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7616 /* If writeback is used when restoring callee-saves, the CFA
7617 is restored on the instruction doing the writeback. */
7618 aarch64_add_offset (Pmode, stack_pointer_rtx,
7619 hard_frame_pointer_rtx,
7620 -callee_offset - below_hard_fp_saved_regs_size,
7621 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7622 else
7623 /* The case where we need to re-use the register here is very rare, so
7624 avoid the complicated condition and just always emit a move if the
7625 immediate doesn't fit. */
7626 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7627
7628 /* Restore the vector registers before the predicate registers,
7629 so that we can use P4 as a temporary for big-endian SVE frames. */
7630 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7631 callee_adjust != 0, &cfi_ops);
7632 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7633 false, &cfi_ops);
7634 if (maybe_ne (sve_callee_adjust, 0))
7635 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7636 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7637 R0_REGNUM, R30_REGNUM,
7638 callee_adjust != 0, &cfi_ops);
7639
7640 if (need_barrier_p)
7641 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7642
7643 if (callee_adjust != 0)
7644 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7645
7646 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7647 {
7648 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7649 insn = get_last_insn ();
7650 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7651 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7652 RTX_FRAME_RELATED_P (insn) = 1;
7653 cfi_ops = NULL;
7654 }
7655
7656 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7657 restrict the emit_move optimization to leaf functions. */
7658 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7659 (!can_inherit_p || !crtl->is_leaf
7660 || df_regs_ever_live_p (EP0_REGNUM)));
7661
7662 if (cfi_ops)
7663 {
7664 /* Emit delayed restores and reset the CFA to be SP. */
7665 insn = get_last_insn ();
7666 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7667 REG_NOTES (insn) = cfi_ops;
7668 RTX_FRAME_RELATED_P (insn) = 1;
7669 }
7670
7671 /* We prefer to emit the combined return/authenticate instruction RETAA,
7672 however, there are three cases in which we must instead emit an explicit
7673 authentication instruction.
7674
7675 1) Sibcalls don't return in a normal way, so if we're about to call one
7676 we must authenticate.
7677
7678 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7679 generating code for !TARGET_ARMV8_3 we can't use it and must
7680 explicitly authenticate.
7681
7682 3) On an eh_return path we make extra stack adjustments to update the
7683 canonical frame address to be the exception handler's CFA. We want
7684 to authenticate using the CFA of the function which calls eh_return.
7685 */
7686 if (aarch64_return_address_signing_enabled ()
7687 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7688 {
7689 switch (aarch64_ra_sign_key)
7690 {
7691 case AARCH64_KEY_A:
7692 insn = emit_insn (gen_autiasp ());
7693 break;
7694 case AARCH64_KEY_B:
7695 insn = emit_insn (gen_autibsp ());
7696 break;
7697 default:
7698 gcc_unreachable ();
7699 }
7700 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7701 RTX_FRAME_RELATED_P (insn) = 1;
7702 }
7703
7704 /* Stack adjustment for exception handler. */
7705 if (crtl->calls_eh_return && !for_sibcall)
7706 {
7707 /* We need to unwind the stack by the offset computed by
7708 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7709 to be SP; letting the CFA move during this adjustment
7710 is just as correct as retaining the CFA from the body
7711 of the function. Therefore, do nothing special. */
7712 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7713 }
7714
7715 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7716 if (!for_sibcall)
7717 emit_jump_insn (ret_rtx);
7718 }
7719
7720 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7721 normally or return to a previous frame after unwinding.
7722
7723 An EH return uses a single shared return sequence. The epilogue is
7724 exactly like a normal epilogue except that it has an extra input
7725 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7726 that must be applied after the frame has been destroyed. An extra label
7727 is inserted before the epilogue which initializes this register to zero,
7728 and this is the entry point for a normal return.
7729
7730 An actual EH return updates the return address, initializes the stack
7731 adjustment and jumps directly into the epilogue (bypassing the zeroing
7732 of the adjustment). Since the return address is typically saved on the
7733 stack when a function makes a call, the saved LR must be updated outside
7734 the epilogue.
7735
7736 This poses problems as the store is generated well before the epilogue,
7737 so the offset of LR is not known yet. Also optimizations will remove the
7738 store as it appears dead, even after the epilogue is generated (as the
7739 base or offset for loading LR is different in many cases).
7740
7741 To avoid these problems this implementation forces the frame pointer
7742 in eh_return functions so that the location of LR is fixed and known early.
7743 It also marks the store volatile, so no optimization is permitted to
7744 remove the store. */
7745 rtx
7746 aarch64_eh_return_handler_rtx (void)
7747 {
7748 rtx tmp = gen_frame_mem (Pmode,
7749 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7750
7751 /* Mark the store volatile, so no optimization is permitted to remove it. */
7752 MEM_VOLATILE_P (tmp) = true;
7753 return tmp;
7754 }
7755
7756 /* Output code to add DELTA to the first argument, and then jump
7757 to FUNCTION. Used for C++ multiple inheritance. */
7758 static void
7759 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7760 HOST_WIDE_INT delta,
7761 HOST_WIDE_INT vcall_offset,
7762 tree function)
7763 {
7764 /* The this pointer is always in x0. Note that this differs from
7765 Arm where the this pointer may be bumped to r1 if r0 is required
7766 to return a pointer to an aggregate. On AArch64 a result value
7767 pointer will be in x8. */
7768 int this_regno = R0_REGNUM;
7769 rtx this_rtx, temp0, temp1, addr, funexp;
7770 rtx_insn *insn;
7771 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7772
7773 if (aarch64_bti_enabled ())
7774 emit_insn (gen_bti_c());
7775
7776 reload_completed = 1;
7777 emit_note (NOTE_INSN_PROLOGUE_END);
7778
7779 this_rtx = gen_rtx_REG (Pmode, this_regno);
7780 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7781 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7782
7783 if (vcall_offset == 0)
7784 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7785 else
7786 {
7787 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7788
7789 addr = this_rtx;
7790 if (delta != 0)
7791 {
7792 if (delta >= -256 && delta < 256)
7793 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7794 plus_constant (Pmode, this_rtx, delta));
7795 else
7796 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7797 temp1, temp0, false);
7798 }
7799
7800 if (Pmode == ptr_mode)
7801 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7802 else
7803 aarch64_emit_move (temp0,
7804 gen_rtx_ZERO_EXTEND (Pmode,
7805 gen_rtx_MEM (ptr_mode, addr)));
7806
7807 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7808 addr = plus_constant (Pmode, temp0, vcall_offset);
7809 else
7810 {
7811 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7812 Pmode);
7813 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7814 }
7815
7816 if (Pmode == ptr_mode)
7817 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
7818 else
7819 aarch64_emit_move (temp1,
7820 gen_rtx_SIGN_EXTEND (Pmode,
7821 gen_rtx_MEM (ptr_mode, addr)));
7822
7823 emit_insn (gen_add2_insn (this_rtx, temp1));
7824 }
7825
7826 /* Generate a tail call to the target function. */
7827 if (!TREE_USED (function))
7828 {
7829 assemble_external (function);
7830 TREE_USED (function) = 1;
7831 }
7832 funexp = XEXP (DECL_RTL (function), 0);
7833 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7834 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7835 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7836 SIBLING_CALL_P (insn) = 1;
7837
7838 insn = get_insns ();
7839 shorten_branches (insn);
7840
7841 assemble_start_function (thunk, fnname);
7842 final_start_function (insn, file, 1);
7843 final (insn, file, 1);
7844 final_end_function ();
7845 assemble_end_function (thunk, fnname);
7846
7847 /* Stop pretending to be a post-reload pass. */
7848 reload_completed = 0;
7849 }
7850
7851 static bool
7852 aarch64_tls_referenced_p (rtx x)
7853 {
7854 if (!TARGET_HAVE_TLS)
7855 return false;
7856 subrtx_iterator::array_type array;
7857 FOR_EACH_SUBRTX (iter, array, x, ALL)
7858 {
7859 const_rtx x = *iter;
7860 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7861 return true;
7862 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7863 TLS offsets, not real symbol references. */
7864 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7865 iter.skip_subrtxes ();
7866 }
7867 return false;
7868 }
7869
7870
7871 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7872 a left shift of 0 or 12 bits. */
7873 bool
7874 aarch64_uimm12_shift (HOST_WIDE_INT val)
7875 {
7876 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7877 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7878 );
7879 }
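/* For example, 0xabc (contained in the low 12 bits) and 0xabc000 (the same
   value shifted left by 12) satisfy aarch64_uimm12_shift, whereas 0xabc0
   does not, because its set bits straddle the two 12-bit fields. */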
7880
7881 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7882 that can be created with a left shift of 0 or 12. */
7883 static HOST_WIDE_INT
7884 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7885 {
7886 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7887 handle correctly. */
7888 gcc_assert ((val & 0xffffff) == val);
7889
7890 if (((val & 0xfff) << 0) == val)
7891 return val;
7892
7893 return val & (0xfff << 12);
7894 }
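/* For example, aarch64_clamp_to_uimm12_shift (0x123456) returns 0x123000:
   the value does not fit in the low 12 bits, so only the part representable
   with a shift of 12 is kept. */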
7895
7896 /* Return true if val is an immediate that can be loaded into a
7897 register by a MOVZ instruction. */
7898 static bool
7899 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7900 {
7901 if (GET_MODE_SIZE (mode) > 4)
7902 {
7903 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7904 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7905 return 1;
7906 }
7907 else
7908 {
7909 /* Ignore sign extension. */
7910 val &= (HOST_WIDE_INT) 0xffffffff;
7911 }
7912 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7913 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7914 }
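/* For example, in DImode, 0xbeef, 0xbeef0000, 0xbeef << 32 and 0xbeef << 48
   are all accepted, since each is a single 16-bit field at a 16-bit-aligned
   position, whereas 0x10000ffff is rejected because its set bits span two
   such fields. */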
7915
7916 /* Test whether:
7917
7918 X = (X & AND_VAL) | IOR_VAL;
7919
7920 can be implemented using:
7921
7922 MOVK X, #(IOR_VAL >> shift), LSL #shift
7923
7924 Return the shift if so, otherwise return -1. */
7925 int
7926 aarch64_movk_shift (const wide_int_ref &and_val,
7927 const wide_int_ref &ior_val)
7928 {
7929 unsigned int precision = and_val.get_precision ();
7930 unsigned HOST_WIDE_INT mask = 0xffff;
7931 for (unsigned int shift = 0; shift < precision; shift += 16)
7932 {
7933 if (and_val == ~mask && (ior_val & mask) == ior_val)
7934 return shift;
7935 mask <<= 16;
7936 }
7937 return -1;
7938 }
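/* For example, with 64-bit precision, AND_VAL == 0xffffffff0000ffff and
   IOR_VAL == 0x12340000 give a shift of 16, corresponding to
   MOVK X, #0x1234, LSL #16. */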
7939
7940 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7941 64-bit (DImode) integer. */
7942
7943 static unsigned HOST_WIDE_INT
7944 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7945 {
7946 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7947 while (size < 64)
7948 {
7949 val &= (HOST_WIDE_INT_1U << size) - 1;
7950 val |= val << size;
7951 size *= 2;
7952 }
7953 return val;
7954 }
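/* For example, the QImode value 0xab is replicated to 0xabababababababab
   and the SImode value 0x00ff00ff becomes 0x00ff00ff00ff00ff. */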
7955
7956 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7957
7958 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7959 {
7960 0x0000000100000001ull,
7961 0x0001000100010001ull,
7962 0x0101010101010101ull,
7963 0x1111111111111111ull,
7964 0x5555555555555555ull,
7965 };
7966
7967
7968 /* Return true if val is a valid bitmask immediate. */
7969
7970 bool
7971 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7972 {
7973 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7974 int bits;
7975
7976 /* Check for a single sequence of one bits and return quickly if so.
7977 The special cases of all ones and all zeroes return false. */
7978 val = aarch64_replicate_bitmask_imm (val_in, mode);
7979 tmp = val + (val & -val);
7980
7981 if (tmp == (tmp & -tmp))
7982 return (val + 1) > 1;
7983
7984 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7985 if (mode == SImode)
7986 val = (val << 32) | (val & 0xffffffff);
7987
7988 /* Invert if the immediate doesn't start with a zero bit - this means we
7989 only need to search for sequences of one bits. */
7990 if (val & 1)
7991 val = ~val;
7992
7993 /* Find the first set bit and set tmp to val with the first sequence of one
7994 bits removed. Return success if there is a single sequence of ones. */
7995 first_one = val & -val;
7996 tmp = val & (val + first_one);
7997
7998 if (tmp == 0)
7999 return true;
8000
8001 /* Find the next set bit and compute the difference in bit position. */
8002 next_one = tmp & -tmp;
8003 bits = clz_hwi (first_one) - clz_hwi (next_one);
8004 mask = val ^ tmp;
8005
8006 /* Check the bit position difference is a power of 2, and that the first
8007 sequence of one bits fits within 'bits' bits. */
8008 if ((mask >> bits) != 0 || bits != (bits & -bits))
8009 return false;
8010
8011 /* Check the sequence of one bits is repeated 64/bits times. */
8012 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8013 }
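/* For example, for DImode, 0x0000000000fffff0 (a single contiguous run of
   ones) and 0x00ff00ff00ff00ff (an 8-bit run repeated every 16 bits) are
   valid bitmask immediates, whereas 0x0000000012345678 is not, because its
   set bits do not form a repeating pattern of contiguous ones. */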
8014
8015 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8016 Assumed precondition: VAL_IN is not zero. */
8017
8018 unsigned HOST_WIDE_INT
8019 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8020 {
8021 int lowest_bit_set = ctz_hwi (val_in);
8022 int highest_bit_set = floor_log2 (val_in);
8023 gcc_assert (val_in != 0);
8024
8025 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8026 (HOST_WIDE_INT_1U << lowest_bit_set));
8027 }
8028
8029 /* Create a constant in which all bits outside the range from the lowest set
8030 bit to the highest set bit of VAL_IN are set to 1. */
8031
8032 unsigned HOST_WIDE_INT
8033 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8034 {
8035 return val_in | ~aarch64_and_split_imm1 (val_in);
8036 }
8037
8038 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8039
8040 bool
8041 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8042 {
8043 scalar_int_mode int_mode;
8044 if (!is_a <scalar_int_mode> (mode, &int_mode))
8045 return false;
8046
8047 if (aarch64_bitmask_imm (val_in, int_mode))
8048 return false;
8049
8050 if (aarch64_move_imm (val_in, int_mode))
8051 return false;
8052
8053 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8054
8055 return aarch64_bitmask_imm (imm2, int_mode);
8056 }
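/* A sketch of the split that the helpers above enable (the value is
   illustrative): an AND with 0x0ff000f0, which is neither a bitmask
   immediate nor a MOV immediate, can instead be done as an AND with
   aarch64_and_split_imm1 (0x0ff000f0) == 0x0ffffff0 followed by an AND with
   aarch64_and_split_imm2 (0x0ff000f0) == 0xfffffffffff000ff, both of which
   are valid bitmask immediates. */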
8057
8058 /* Return true if val is an immediate that can be loaded into a
8059 register in a single instruction. */
8060 bool
8061 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8062 {
8063 scalar_int_mode int_mode;
8064 if (!is_a <scalar_int_mode> (mode, &int_mode))
8065 return false;
8066
8067 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8068 return 1;
8069 return aarch64_bitmask_imm (val, int_mode);
8070 }
8071
8072 static bool
8073 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8074 {
8075 rtx base, offset;
8076
8077 if (GET_CODE (x) == HIGH)
8078 return true;
8079
8080 /* There's no way to calculate VL-based values using relocations. */
8081 subrtx_iterator::array_type array;
8082 FOR_EACH_SUBRTX (iter, array, x, ALL)
8083 if (GET_CODE (*iter) == CONST_POLY_INT)
8084 return true;
8085
8086 split_const (x, &base, &offset);
8087 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8088 {
8089 if (aarch64_classify_symbol (base, INTVAL (offset))
8090 != SYMBOL_FORCE_TO_MEM)
8091 return true;
8092 else
8093 /* Avoid generating a 64-bit relocation in ILP32; leave
8094 to aarch64_expand_mov_immediate to handle it properly. */
8095 return mode != ptr_mode;
8096 }
8097
8098 return aarch64_tls_referenced_p (x);
8099 }
8100
8101 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8102 The expansion for a table switch is quite expensive due to the number
8103 of instructions, the table lookup and the hard-to-predict indirect jump.
8104 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8105 set, otherwise use tables for > 16 cases as a tradeoff between size and
8106 performance. When optimizing for size, use the default setting. */
8107
8108 static unsigned int
8109 aarch64_case_values_threshold (void)
8110 {
8111 /* Use the specified limit for the number of cases before using jump
8112 tables at higher optimization levels. */
8113 if (optimize > 2
8114 && selected_cpu->tune->max_case_values != 0)
8115 return selected_cpu->tune->max_case_values;
8116 else
8117 return optimize_size ? default_case_values_threshold () : 17;
8118 }
8119
8120 /* Return true if register REGNO is a valid index register.
8121 STRICT_P is true if REG_OK_STRICT is in effect. */
8122
8123 bool
8124 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8125 {
8126 if (!HARD_REGISTER_NUM_P (regno))
8127 {
8128 if (!strict_p)
8129 return true;
8130
8131 if (!reg_renumber)
8132 return false;
8133
8134 regno = reg_renumber[regno];
8135 }
8136 return GP_REGNUM_P (regno);
8137 }
8138
8139 /* Return true if register REGNO is a valid base register for mode MODE.
8140 STRICT_P is true if REG_OK_STRICT is in effect. */
8141
8142 bool
8143 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8144 {
8145 if (!HARD_REGISTER_NUM_P (regno))
8146 {
8147 if (!strict_p)
8148 return true;
8149
8150 if (!reg_renumber)
8151 return false;
8152
8153 regno = reg_renumber[regno];
8154 }
8155
8156 /* The fake registers will be eliminated to either the stack or
8157 hard frame pointer, both of which are usually valid base registers.
8158 Reload deals with the cases where the eliminated form isn't valid. */
8159 return (GP_REGNUM_P (regno)
8160 || regno == SP_REGNUM
8161 || regno == FRAME_POINTER_REGNUM
8162 || regno == ARG_POINTER_REGNUM);
8163 }
8164
8165 /* Return true if X is a valid base register for mode MODE.
8166 STRICT_P is true if REG_OK_STRICT is in effect. */
8167
8168 static bool
8169 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8170 {
8171 if (!strict_p
8172 && GET_CODE (x) == SUBREG
8173 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8174 x = SUBREG_REG (x);
8175
8176 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8177 }
8178
8179 /* Return true if address offset is a valid index. If it is, fill in INFO
8180 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8181
8182 static bool
8183 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8184 machine_mode mode, bool strict_p)
8185 {
8186 enum aarch64_address_type type;
8187 rtx index;
8188 int shift;
8189
8190 /* (reg:P) */
8191 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8192 && GET_MODE (x) == Pmode)
8193 {
8194 type = ADDRESS_REG_REG;
8195 index = x;
8196 shift = 0;
8197 }
8198 /* (sign_extend:DI (reg:SI)) */
8199 else if ((GET_CODE (x) == SIGN_EXTEND
8200 || GET_CODE (x) == ZERO_EXTEND)
8201 && GET_MODE (x) == DImode
8202 && GET_MODE (XEXP (x, 0)) == SImode)
8203 {
8204 type = (GET_CODE (x) == SIGN_EXTEND)
8205 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8206 index = XEXP (x, 0);
8207 shift = 0;
8208 }
8209 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8210 else if (GET_CODE (x) == MULT
8211 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8212 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8213 && GET_MODE (XEXP (x, 0)) == DImode
8214 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8215 && CONST_INT_P (XEXP (x, 1)))
8216 {
8217 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8218 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8219 index = XEXP (XEXP (x, 0), 0);
8220 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8221 }
8222 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8223 else if (GET_CODE (x) == ASHIFT
8224 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8225 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8226 && GET_MODE (XEXP (x, 0)) == DImode
8227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8228 && CONST_INT_P (XEXP (x, 1)))
8229 {
8230 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8232 index = XEXP (XEXP (x, 0), 0);
8233 shift = INTVAL (XEXP (x, 1));
8234 }
8235 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8236 else if ((GET_CODE (x) == SIGN_EXTRACT
8237 || GET_CODE (x) == ZERO_EXTRACT)
8238 && GET_MODE (x) == DImode
8239 && GET_CODE (XEXP (x, 0)) == MULT
8240 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8241 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8242 {
8243 type = (GET_CODE (x) == SIGN_EXTRACT)
8244 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8245 index = XEXP (XEXP (x, 0), 0);
8246 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8247 if (INTVAL (XEXP (x, 1)) != 32 + shift
8248 || INTVAL (XEXP (x, 2)) != 0)
8249 shift = -1;
8250 }
8251 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8252 (const_int 0xffffffff<<shift)) */
8253 else if (GET_CODE (x) == AND
8254 && GET_MODE (x) == DImode
8255 && GET_CODE (XEXP (x, 0)) == MULT
8256 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8257 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8258 && CONST_INT_P (XEXP (x, 1)))
8259 {
8260 type = ADDRESS_REG_UXTW;
8261 index = XEXP (XEXP (x, 0), 0);
8262 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8263 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8264 shift = -1;
8265 }
8266 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8267 else if ((GET_CODE (x) == SIGN_EXTRACT
8268 || GET_CODE (x) == ZERO_EXTRACT)
8269 && GET_MODE (x) == DImode
8270 && GET_CODE (XEXP (x, 0)) == ASHIFT
8271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8272 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8273 {
8274 type = (GET_CODE (x) == SIGN_EXTRACT)
8275 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8276 index = XEXP (XEXP (x, 0), 0);
8277 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8278 if (INTVAL (XEXP (x, 1)) != 32 + shift
8279 || INTVAL (XEXP (x, 2)) != 0)
8280 shift = -1;
8281 }
8282 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8283 (const_int 0xffffffff<<shift)) */
8284 else if (GET_CODE (x) == AND
8285 && GET_MODE (x) == DImode
8286 && GET_CODE (XEXP (x, 0)) == ASHIFT
8287 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8288 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8289 && CONST_INT_P (XEXP (x, 1)))
8290 {
8291 type = ADDRESS_REG_UXTW;
8292 index = XEXP (XEXP (x, 0), 0);
8293 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8294 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8295 shift = -1;
8296 }
8297 /* (mult:P (reg:P) (const_int scale)) */
8298 else if (GET_CODE (x) == MULT
8299 && GET_MODE (x) == Pmode
8300 && GET_MODE (XEXP (x, 0)) == Pmode
8301 && CONST_INT_P (XEXP (x, 1)))
8302 {
8303 type = ADDRESS_REG_REG;
8304 index = XEXP (x, 0);
8305 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8306 }
8307 /* (ashift:P (reg:P) (const_int shift)) */
8308 else if (GET_CODE (x) == ASHIFT
8309 && GET_MODE (x) == Pmode
8310 && GET_MODE (XEXP (x, 0)) == Pmode
8311 && CONST_INT_P (XEXP (x, 1)))
8312 {
8313 type = ADDRESS_REG_REG;
8314 index = XEXP (x, 0);
8315 shift = INTVAL (XEXP (x, 1));
8316 }
8317 else
8318 return false;
8319
8320 if (!strict_p
8321 && GET_CODE (index) == SUBREG
8322 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8323 index = SUBREG_REG (index);
8324
8325 if (aarch64_sve_data_mode_p (mode))
8326 {
8327 if (type != ADDRESS_REG_REG
8328 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8329 return false;
8330 }
8331 else
8332 {
8333 if (shift != 0
8334 && !(IN_RANGE (shift, 1, 3)
8335 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8336 return false;
8337 }
8338
8339 if (REG_P (index)
8340 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8341 {
8342 info->type = type;
8343 info->offset = index;
8344 info->shift = shift;
8345 return true;
8346 }
8347
8348 return false;
8349 }
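/* For example, an index of the form (ashift:DI (reg:DI) (const_int 3)) used
   with an 8-byte mode such as DImode is classified above as ADDRESS_REG_REG
   with a shift of 3, corresponding to the [Xn, Xm, LSL #3] addressing
   form. */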
8350
8351 /* Return true if MODE is one of the modes for which we
8352 support LDP/STP operations. */
8353
8354 static bool
8355 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8356 {
8357 return mode == SImode || mode == DImode
8358 || mode == SFmode || mode == DFmode
8359 || (aarch64_vector_mode_supported_p (mode)
8360 && (known_eq (GET_MODE_SIZE (mode), 8)
8361 || (known_eq (GET_MODE_SIZE (mode), 16)
8362 && (aarch64_tune_params.extra_tuning_flags
8363 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8364 }
8365
8366 /* Return true if REGNO is a virtual pointer register, or an eliminable
8367 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8368 include stack_pointer or hard_frame_pointer. */
8369 static bool
8370 virt_or_elim_regno_p (unsigned regno)
8371 {
8372 return ((regno >= FIRST_VIRTUAL_REGISTER
8373 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8374 || regno == FRAME_POINTER_REGNUM
8375 || regno == ARG_POINTER_REGNUM);
8376 }
8377
8378 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8379 If it is, fill in INFO appropriately. STRICT_P is true if
8380 REG_OK_STRICT is in effect. */
8381
8382 bool
8383 aarch64_classify_address (struct aarch64_address_info *info,
8384 rtx x, machine_mode mode, bool strict_p,
8385 aarch64_addr_query_type type)
8386 {
8387 enum rtx_code code = GET_CODE (x);
8388 rtx op0, op1;
8389 poly_int64 offset;
8390
8391 HOST_WIDE_INT const_size;
8392
8393 /* Whether a vector mode is partial doesn't affect address legitimacy.
8394 Partial vectors like VNx8QImode allow the same indexed addressing
8395 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8396 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8397 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8398 vec_flags &= ~VEC_PARTIAL;
8399
8400 /* On BE, we use load/store pair for all large int mode load/stores.
8401 TI/TFmode may also use a load/store pair. */
8402 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8403 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8404 || type == ADDR_QUERY_LDP_STP_N
8405 || mode == TImode
8406 || mode == TFmode
8407 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8408
8409   /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8410      corresponds to the actual size of the memory being loaded/stored and the
8411      mode of the corresponding addressing mode is half of that.

8412 if (type == ADDR_QUERY_LDP_STP_N
8413 && known_eq (GET_MODE_SIZE (mode), 16))
8414 mode = DFmode;
8415
8416 bool allow_reg_index_p = (!load_store_pair_p
8417 && (known_lt (GET_MODE_SIZE (mode), 16)
8418 || vec_flags == VEC_ADVSIMD
8419 || vec_flags & VEC_SVE_DATA));
8420
8421 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8422 [Rn, #offset, MUL VL]. */
8423 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8424 && (code != REG && code != PLUS))
8425 return false;
8426
8427 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8428 REG addressing. */
8429 if (advsimd_struct_p
8430 && !BYTES_BIG_ENDIAN
8431 && (code != POST_INC && code != REG))
8432 return false;
8433
8434 gcc_checking_assert (GET_MODE (x) == VOIDmode
8435 || SCALAR_INT_MODE_P (GET_MODE (x)));
8436
8437 switch (code)
8438 {
8439 case REG:
8440 case SUBREG:
8441 info->type = ADDRESS_REG_IMM;
8442 info->base = x;
8443 info->offset = const0_rtx;
8444 info->const_offset = 0;
8445 return aarch64_base_register_rtx_p (x, strict_p);
8446
8447 case PLUS:
8448 op0 = XEXP (x, 0);
8449 op1 = XEXP (x, 1);
8450
8451 if (! strict_p
8452 && REG_P (op0)
8453 && virt_or_elim_regno_p (REGNO (op0))
8454 && poly_int_rtx_p (op1, &offset))
8455 {
8456 info->type = ADDRESS_REG_IMM;
8457 info->base = op0;
8458 info->offset = op1;
8459 info->const_offset = offset;
8460
8461 return true;
8462 }
8463
8464 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8465 && aarch64_base_register_rtx_p (op0, strict_p)
8466 && poly_int_rtx_p (op1, &offset))
8467 {
8468 info->type = ADDRESS_REG_IMM;
8469 info->base = op0;
8470 info->offset = op1;
8471 info->const_offset = offset;
8472
8473 /* TImode and TFmode values are allowed in both pairs of X
8474 registers and individual Q registers. The available
8475 address modes are:
8476 X,X: 7-bit signed scaled offset
8477 Q: 9-bit signed offset
8478 We conservatively require an offset representable in either mode.
8479 When performing the check for pairs of X registers i.e. LDP/STP
8480 pass down DImode since that is the natural size of the LDP/STP
8481 instruction memory accesses. */
8482 if (mode == TImode || mode == TFmode)
8483 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8484 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8485 || offset_12bit_unsigned_scaled_p (mode, offset)));
8486
8487 	  /* A 7-bit offset check because OImode will emit an ldp/stp
8488 instruction (only big endian will get here).
8489 For ldp/stp instructions, the offset is scaled for the size of a
8490 single element of the pair. */
8491 if (mode == OImode)
8492 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8493
8494 	  /* Three 9/12-bit offset checks because CImode will emit three
8495 ldr/str instructions (only big endian will get here). */
8496 if (mode == CImode)
8497 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8498 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8499 offset + 32)
8500 || offset_12bit_unsigned_scaled_p (V16QImode,
8501 offset + 32)));
8502
8503 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
8504 instructions (only big endian will get here). */
8505 if (mode == XImode)
8506 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8507 && aarch64_offset_7bit_signed_scaled_p (TImode,
8508 offset + 32));
8509
8510 /* Make "m" use the LD1 offset range for SVE data modes, so
8511 that pre-RTL optimizers like ivopts will work to that
8512 instead of the wider LDR/STR range. */
8513 if (vec_flags == VEC_SVE_DATA)
8514 return (type == ADDR_QUERY_M
8515 ? offset_4bit_signed_scaled_p (mode, offset)
8516 : offset_9bit_signed_scaled_p (mode, offset));
8517
8518 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8519 {
8520 poly_int64 end_offset = (offset
8521 + GET_MODE_SIZE (mode)
8522 - BYTES_PER_SVE_VECTOR);
8523 return (type == ADDR_QUERY_M
8524 ? offset_4bit_signed_scaled_p (mode, offset)
8525 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8526 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8527 end_offset)));
8528 }
8529
8530 if (vec_flags == VEC_SVE_PRED)
8531 return offset_9bit_signed_scaled_p (mode, offset);
8532
8533 if (load_store_pair_p)
8534 return ((known_eq (GET_MODE_SIZE (mode), 4)
8535 || known_eq (GET_MODE_SIZE (mode), 8)
8536 || known_eq (GET_MODE_SIZE (mode), 16))
8537 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8538 else
8539 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8540 || offset_12bit_unsigned_scaled_p (mode, offset));
8541 }
8542
8543 if (allow_reg_index_p)
8544 {
8545 /* Look for base + (scaled/extended) index register. */
8546 if (aarch64_base_register_rtx_p (op0, strict_p)
8547 && aarch64_classify_index (info, op1, mode, strict_p))
8548 {
8549 info->base = op0;
8550 return true;
8551 }
8552 if (aarch64_base_register_rtx_p (op1, strict_p)
8553 && aarch64_classify_index (info, op0, mode, strict_p))
8554 {
8555 info->base = op1;
8556 return true;
8557 }
8558 }
8559
8560 return false;
8561
8562 case POST_INC:
8563 case POST_DEC:
8564 case PRE_INC:
8565 case PRE_DEC:
8566 info->type = ADDRESS_REG_WB;
8567 info->base = XEXP (x, 0);
8568 info->offset = NULL_RTX;
8569 return aarch64_base_register_rtx_p (info->base, strict_p);
8570
8571 case POST_MODIFY:
8572 case PRE_MODIFY:
8573 info->type = ADDRESS_REG_WB;
8574 info->base = XEXP (x, 0);
8575 if (GET_CODE (XEXP (x, 1)) == PLUS
8576 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8577 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8578 && aarch64_base_register_rtx_p (info->base, strict_p))
8579 {
8580 info->offset = XEXP (XEXP (x, 1), 1);
8581 info->const_offset = offset;
8582
8583 /* TImode and TFmode values are allowed in both pairs of X
8584 registers and individual Q registers. The available
8585 address modes are:
8586 X,X: 7-bit signed scaled offset
8587 Q: 9-bit signed offset
8588 We conservatively require an offset representable in either mode.
8589 */
8590 if (mode == TImode || mode == TFmode)
8591 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8592 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8593
8594 if (load_store_pair_p)
8595 return ((known_eq (GET_MODE_SIZE (mode), 4)
8596 || known_eq (GET_MODE_SIZE (mode), 8)
8597 || known_eq (GET_MODE_SIZE (mode), 16))
8598 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8599 else
8600 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8601 }
8602 return false;
8603
8604 case CONST:
8605 case SYMBOL_REF:
8606 case LABEL_REF:
8607 /* load literal: pc-relative constant pool entry. Only supported
8608 for SI mode or larger. */
8609 info->type = ADDRESS_SYMBOLIC;
8610
8611 if (!load_store_pair_p
8612 && GET_MODE_SIZE (mode).is_constant (&const_size)
8613 && const_size >= 4)
8614 {
8615 rtx sym, addend;
8616
8617 split_const (x, &sym, &addend);
8618 return ((GET_CODE (sym) == LABEL_REF
8619 || (GET_CODE (sym) == SYMBOL_REF
8620 && CONSTANT_POOL_ADDRESS_P (sym)
8621 && aarch64_pcrelative_literal_loads)));
8622 }
8623 return false;
8624
8625 case LO_SUM:
8626 info->type = ADDRESS_LO_SUM;
8627 info->base = XEXP (x, 0);
8628 info->offset = XEXP (x, 1);
8629 if (allow_reg_index_p
8630 && aarch64_base_register_rtx_p (info->base, strict_p))
8631 {
8632 rtx sym, offs;
8633 split_const (info->offset, &sym, &offs);
8634 if (GET_CODE (sym) == SYMBOL_REF
8635 && (aarch64_classify_symbol (sym, INTVAL (offs))
8636 == SYMBOL_SMALL_ABSOLUTE))
8637 {
8638 /* The symbol and offset must be aligned to the access size. */
8639 unsigned int align;
8640
8641 if (CONSTANT_POOL_ADDRESS_P (sym))
8642 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8643 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8644 {
8645 tree exp = SYMBOL_REF_DECL (sym);
8646 align = TYPE_ALIGN (TREE_TYPE (exp));
8647 align = aarch64_constant_alignment (exp, align);
8648 }
8649 else if (SYMBOL_REF_DECL (sym))
8650 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8651 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8652 && SYMBOL_REF_BLOCK (sym) != NULL)
8653 align = SYMBOL_REF_BLOCK (sym)->alignment;
8654 else
8655 align = BITS_PER_UNIT;
8656
8657 poly_int64 ref_size = GET_MODE_SIZE (mode);
8658 if (known_eq (ref_size, 0))
8659 ref_size = GET_MODE_SIZE (DImode);
8660
8661 return (multiple_p (INTVAL (offs), ref_size)
8662 && multiple_p (align / BITS_PER_UNIT, ref_size));
8663 }
8664 }
8665 return false;
8666
8667 default:
8668 return false;
8669 }
8670 }
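
/* Illustrative sketch (editorial, assuming LP64; register names are
   arbitrary): the classifier above accepts, among others, the forms

     ADDRESS_REG_IMM	[x0] or [x0, 16]
     ADDRESS_REG_REG	[x0, x1] or [x0, x1, lsl 3]
     ADDRESS_REG_UXTW	[x0, w1, uxtw 2]
     ADDRESS_REG_SXTW	[x0, w1, sxtw 2]
     ADDRESS_REG_WB	[x0, 16]! or [x0], 16
     ADDRESS_LO_SUM	[x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC	a PC-relative literal-pool reference

   with SVE data and predicate modes additionally using scaled
   [x0, #imm, mul vl] offsets.  */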
8671
8672 /* Return true if the address X is valid for a PRFM instruction.
8673 STRICT_P is true if we should do strict checking with
8674 aarch64_classify_address. */
8675
8676 bool
8677 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8678 {
8679 struct aarch64_address_info addr;
8680
8681 /* PRFM accepts the same addresses as DImode... */
8682 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8683 if (!res)
8684 return false;
8685
8686 /* ... except writeback forms. */
8687 return addr.type != ADDRESS_REG_WB;
8688 }
8689
8690 bool
8691 aarch64_symbolic_address_p (rtx x)
8692 {
8693 rtx offset;
8694
8695 split_const (x, &x, &offset);
8696 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8697 }
8698
8699 /* Classify the base of symbolic expression X. */
8700
8701 enum aarch64_symbol_type
8702 aarch64_classify_symbolic_expression (rtx x)
8703 {
8704 rtx offset;
8705
8706 split_const (x, &x, &offset);
8707 return aarch64_classify_symbol (x, INTVAL (offset));
8708 }
8709
8710
8711 /* Return TRUE if X is a legitimate address for accessing memory in
8712 mode MODE. */
8713 static bool
8714 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8715 {
8716 struct aarch64_address_info addr;
8717
8718 return aarch64_classify_address (&addr, x, mode, strict_p);
8719 }
8720
8721 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8722 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8723 bool
8724 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8725 aarch64_addr_query_type type)
8726 {
8727 struct aarch64_address_info addr;
8728
8729 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8730 }
8731
8732 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8733
8734 static bool
8735 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8736 poly_int64 orig_offset,
8737 machine_mode mode)
8738 {
8739 HOST_WIDE_INT size;
8740 if (GET_MODE_SIZE (mode).is_constant (&size))
8741 {
8742 HOST_WIDE_INT const_offset, second_offset;
8743
8744 /* A general SVE offset is A * VQ + B. Remove the A component from
8745 coefficient 0 in order to get the constant B. */
8746 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8747
8748 /* Split an out-of-range address displacement into a base and
8749 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8750 range otherwise to increase opportunities for sharing the base
8751 address of different sizes. Unaligned accesses use the signed
8752 9-bit range, TImode/TFmode use the intersection of signed
8753 scaled 7-bit and signed 9-bit offset. */
8754 if (mode == TImode || mode == TFmode)
8755 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8756 else if ((const_offset & (size - 1)) != 0)
8757 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8758 else
8759 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8760
8761 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8762 return false;
8763
8764 /* Split the offset into second_offset and the rest. */
8765 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8766 *offset2 = gen_int_mode (second_offset, Pmode);
8767 return true;
8768 }
8769 else
8770 {
8771 /* Get the mode we should use as the basis of the range. For structure
8772 modes this is the mode of one vector. */
8773 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8774 machine_mode step_mode
8775 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8776
8777 /* Get the "mul vl" multiplier we'd like to use. */
8778 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8779 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8780 if (vec_flags & VEC_SVE_DATA)
8781 /* LDR supports a 9-bit range, but the move patterns for
8782 structure modes require all vectors to be in range of the
8783 	   same base.  The simplest way of accommodating that while still
8784 promoting reuse of anchor points between different modes is
8785 to use an 8-bit range unconditionally. */
8786 vnum = ((vnum + 128) & 255) - 128;
8787 else
8788 /* Predicates are only handled singly, so we might as well use
8789 the full range. */
8790 vnum = ((vnum + 256) & 511) - 256;
8791 if (vnum == 0)
8792 return false;
8793
8794 /* Convert the "mul vl" multiplier into a byte offset. */
8795 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8796 if (known_eq (second_offset, orig_offset))
8797 return false;
8798
8799 /* Split the offset into second_offset and the rest. */
8800 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8801 *offset2 = gen_int_mode (second_offset, Pmode);
8802 return true;
8803 }
8804 }
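
/* Worked example (editorial, illustrative only): for a constant-size
   DImode access at offset 0x10008, the offset is a multiple of the access
   size, so second_offset = 0x10008 & 0x3ffc = 0x8 and the displacement is
   split as 0x10000 + 0x8; the large part can then be shared between
   neighbouring accesses while the small part stays within the scaled
   LDR/STR immediate range.  */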
8805
8806 /* Return the binary representation of floating point constant VALUE in INTVAL.
8807 If the value cannot be converted, return false without setting INTVAL.
8808 The conversion is done in the given MODE. */
8809 bool
8810 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8811 {
8812
8813 /* We make a general exception for 0. */
8814 if (aarch64_float_const_zero_rtx_p (value))
8815 {
8816 *intval = 0;
8817 return true;
8818 }
8819
8820 scalar_float_mode mode;
8821 if (GET_CODE (value) != CONST_DOUBLE
8822 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8823 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8824 /* Only support up to DF mode. */
8825 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8826 return false;
8827
8828 unsigned HOST_WIDE_INT ival = 0;
8829
8830 long res[2];
8831 real_to_target (res,
8832 CONST_DOUBLE_REAL_VALUE (value),
8833 REAL_MODE_FORMAT (mode));
8834
8835 if (mode == DFmode)
8836 {
8837 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8838 ival = zext_hwi (res[order], 32);
8839 ival |= (zext_hwi (res[1 - order], 32) << 32);
8840 }
8841 else
8842 ival = zext_hwi (res[0], 32);
8843
8844 *intval = ival;
8845 return true;
8846 }
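
/* Illustrative example (editorial): for the DFmode constant 1.0 the
   routine above stores 0x3ff0000000000000 in *INTVAL, and for the SFmode
   constant 1.0f it stores 0x3f800000, i.e. the IEEE-754 bit patterns of
   the values.  */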
8847
8848 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8849 single MOV(+MOVK) followed by an FMOV. */
8850 bool
8851 aarch64_float_const_rtx_p (rtx x)
8852 {
8853 machine_mode mode = GET_MODE (x);
8854 if (mode == VOIDmode)
8855 return false;
8856
8857 /* Determine whether it's cheaper to write float constants as
8858      mov/movk pairs than as adrp/ldr pairs.  */
8859 unsigned HOST_WIDE_INT ival;
8860
8861 if (GET_CODE (x) == CONST_DOUBLE
8862 && SCALAR_FLOAT_MODE_P (mode)
8863 && aarch64_reinterpret_float_as_int (x, &ival))
8864 {
8865 scalar_int_mode imode = (mode == HFmode
8866 ? SImode
8867 : int_mode_for_mode (mode).require ());
8868 int num_instr = aarch64_internal_mov_immediate
8869 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8870 return num_instr < 3;
8871 }
8872
8873 return false;
8874 }
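
/* Illustrative example (editorial, not authoritative): the DFmode
   constant 1.5 has the bit pattern 0x3ff8000000000000, which a single
   "mov x0, #0x3ff8000000000000" (a MOVZ with shift) can materialize, so
   the predicate above returns true for it; constants whose bit patterns
   need three or more MOV/MOVK instructions return false.  */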
8875
8876 /* Return TRUE if rtx X is immediate constant 0.0 */
8877 bool
8878 aarch64_float_const_zero_rtx_p (rtx x)
8879 {
8880 if (GET_MODE (x) == VOIDmode)
8881 return false;
8882
8883 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8884 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8885 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8886 }
8887
8888 /* Return TRUE if rtx X is immediate constant that fits in a single
8889 MOVI immediate operation. */
8890 bool
8891 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8892 {
8893 if (!TARGET_SIMD)
8894 return false;
8895
8896 machine_mode vmode;
8897 scalar_int_mode imode;
8898 unsigned HOST_WIDE_INT ival;
8899
8900 if (GET_CODE (x) == CONST_DOUBLE
8901 && SCALAR_FLOAT_MODE_P (mode))
8902 {
8903 if (!aarch64_reinterpret_float_as_int (x, &ival))
8904 return false;
8905
8906 /* We make a general exception for 0. */
8907 if (aarch64_float_const_zero_rtx_p (x))
8908 return true;
8909
8910 imode = int_mode_for_mode (mode).require ();
8911 }
8912 else if (GET_CODE (x) == CONST_INT
8913 && is_a <scalar_int_mode> (mode, &imode))
8914 ival = INTVAL (x);
8915 else
8916 return false;
8917
8918   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
8919      a 128-bit vector mode.  */
8920 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8921
8922 vmode = aarch64_simd_container_mode (imode, width);
8923 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8924
8925 return aarch64_simd_valid_immediate (v_op, NULL);
8926 }
8927
8928
8929 /* Return the fixed registers used for condition codes. */
8930
8931 static bool
8932 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8933 {
8934 *p1 = CC_REGNUM;
8935 *p2 = INVALID_REGNUM;
8936 return true;
8937 }
8938
8939 /* This function is used by the call expanders of the machine description.
8940 RESULT is the register in which the result is returned. It's NULL for
8941 "call" and "sibcall".
8942 MEM is the location of the function call.
8943 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8944    SIBCALL indicates whether this function call is a normal call or a sibling
8945    call.  It will generate a different pattern accordingly.  */
8946
8947 void
8948 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8949 {
8950 rtx call, callee, tmp;
8951 rtvec vec;
8952 machine_mode mode;
8953
8954 gcc_assert (MEM_P (mem));
8955 callee = XEXP (mem, 0);
8956 mode = GET_MODE (callee);
8957 gcc_assert (mode == Pmode);
8958
8959 /* Decide if we should generate indirect calls by loading the
8960 address of the callee into a register before performing
8961 the branch-and-link. */
8962 if (SYMBOL_REF_P (callee)
8963 ? (aarch64_is_long_call_p (callee)
8964 || aarch64_is_noplt_call_p (callee))
8965 : !REG_P (callee))
8966 XEXP (mem, 0) = force_reg (mode, callee);
8967
8968 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8969
8970 if (result != NULL_RTX)
8971 call = gen_rtx_SET (result, call);
8972
8973 if (sibcall)
8974 tmp = ret_rtx;
8975 else
8976 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8977
8978 gcc_assert (CONST_INT_P (callee_abi));
8979 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8980 UNSPEC_CALLEE_ABI);
8981
8982 vec = gen_rtvec (3, call, callee_abi, tmp);
8983 call = gen_rtx_PARALLEL (VOIDmode, vec);
8984
8985 aarch64_emit_call_insn (call);
8986 }
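
/* Illustrative sketch (editorial): for a plain call the pattern emitted
   above has roughly the shape

     (parallel [(call (mem (symbol_ref "foo")) (const_int 0))
		(unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI)
		(clobber (reg:DI LR_REGNUM))])

   where the CALL is wrapped in a SET of the result register when RESULT
   is given, and a RETURN replaces the LR clobber for sibling calls.  */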
8987
8988 /* Emit call insn with PAT and do aarch64-specific handling. */
8989
8990 void
8991 aarch64_emit_call_insn (rtx pat)
8992 {
8993 rtx insn = emit_call_insn (pat);
8994
8995 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8996 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8997 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8998 }
8999
9000 machine_mode
9001 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9002 {
9003 machine_mode mode_x = GET_MODE (x);
9004 rtx_code code_x = GET_CODE (x);
9005
9006 /* All floating point compares return CCFP if it is an equality
9007 comparison, and CCFPE otherwise. */
9008 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9009 {
9010 switch (code)
9011 {
9012 case EQ:
9013 case NE:
9014 case UNORDERED:
9015 case ORDERED:
9016 case UNLT:
9017 case UNLE:
9018 case UNGT:
9019 case UNGE:
9020 case UNEQ:
9021 return CCFPmode;
9022
9023 case LT:
9024 case LE:
9025 case GT:
9026 case GE:
9027 case LTGT:
9028 return CCFPEmode;
9029
9030 default:
9031 gcc_unreachable ();
9032 }
9033 }
9034
9035 /* Equality comparisons of short modes against zero can be performed
9036 using the TST instruction with the appropriate bitmask. */
9037 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9038 && (code == EQ || code == NE)
9039 && (mode_x == HImode || mode_x == QImode))
9040 return CC_NZmode;
9041
9042 /* Similarly, comparisons of zero_extends from shorter modes can
9043 be performed using an ANDS with an immediate mask. */
9044 if (y == const0_rtx && code_x == ZERO_EXTEND
9045 && (mode_x == SImode || mode_x == DImode)
9046 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9047 && (code == EQ || code == NE))
9048 return CC_NZmode;
9049
9050 if ((mode_x == SImode || mode_x == DImode)
9051 && y == const0_rtx
9052 && (code == EQ || code == NE || code == LT || code == GE)
9053 && (code_x == PLUS || code_x == MINUS || code_x == AND
9054 || code_x == NEG
9055 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9056 && CONST_INT_P (XEXP (x, 2)))))
9057 return CC_NZmode;
9058
9059 /* A compare with a shifted operand. Because of canonicalization,
9060 the comparison will have to be swapped when we emit the assembly
9061 code. */
9062 if ((mode_x == SImode || mode_x == DImode)
9063 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9064 && (code_x == ASHIFT || code_x == ASHIFTRT
9065 || code_x == LSHIFTRT
9066 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9067 return CC_SWPmode;
9068
9069 /* Similarly for a negated operand, but we can only do this for
9070 equalities. */
9071 if ((mode_x == SImode || mode_x == DImode)
9072 && (REG_P (y) || GET_CODE (y) == SUBREG)
9073 && (code == EQ || code == NE)
9074 && code_x == NEG)
9075 return CC_Zmode;
9076
9077 /* A test for unsigned overflow from an addition. */
9078 if ((mode_x == DImode || mode_x == TImode)
9079 && (code == LTU || code == GEU)
9080 && code_x == PLUS
9081 && rtx_equal_p (XEXP (x, 0), y))
9082 return CC_Cmode;
9083
9084 /* A test for unsigned overflow from an add with carry. */
9085 if ((mode_x == DImode || mode_x == TImode)
9086 && (code == LTU || code == GEU)
9087 && code_x == PLUS
9088 && CONST_SCALAR_INT_P (y)
9089 && (rtx_mode_t (y, mode_x)
9090 == (wi::shwi (1, mode_x)
9091 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9092 return CC_ADCmode;
9093
9094 /* A test for signed overflow. */
9095 if ((mode_x == DImode || mode_x == TImode)
9096 && code == NE
9097 && code_x == PLUS
9098 && GET_CODE (y) == SIGN_EXTEND)
9099 return CC_Vmode;
9100
9101 /* For everything else, return CCmode. */
9102 return CCmode;
9103 }
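
/* Illustrative example (editorial): for

     int g (int a, int b) { return (a + b) == 0; }

   the equality test of (plus a b) against zero selects CC_NZmode above,
   which allows the addition and the comparison to be combined into a
   single flag-setting instruction (ADDS/CMN) rather than ADD plus CMP.  */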
9104
9105 static int
9106 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9107
9108 int
9109 aarch64_get_condition_code (rtx x)
9110 {
9111 machine_mode mode = GET_MODE (XEXP (x, 0));
9112 enum rtx_code comp_code = GET_CODE (x);
9113
9114 if (GET_MODE_CLASS (mode) != MODE_CC)
9115 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9116 return aarch64_get_condition_code_1 (mode, comp_code);
9117 }
9118
9119 static int
9120 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9121 {
9122 switch (mode)
9123 {
9124 case E_CCFPmode:
9125 case E_CCFPEmode:
9126 switch (comp_code)
9127 {
9128 case GE: return AARCH64_GE;
9129 case GT: return AARCH64_GT;
9130 case LE: return AARCH64_LS;
9131 case LT: return AARCH64_MI;
9132 case NE: return AARCH64_NE;
9133 case EQ: return AARCH64_EQ;
9134 case ORDERED: return AARCH64_VC;
9135 case UNORDERED: return AARCH64_VS;
9136 case UNLT: return AARCH64_LT;
9137 case UNLE: return AARCH64_LE;
9138 case UNGT: return AARCH64_HI;
9139 case UNGE: return AARCH64_PL;
9140 default: return -1;
9141 }
9142 break;
9143
9144 case E_CCmode:
9145 switch (comp_code)
9146 {
9147 case NE: return AARCH64_NE;
9148 case EQ: return AARCH64_EQ;
9149 case GE: return AARCH64_GE;
9150 case GT: return AARCH64_GT;
9151 case LE: return AARCH64_LE;
9152 case LT: return AARCH64_LT;
9153 case GEU: return AARCH64_CS;
9154 case GTU: return AARCH64_HI;
9155 case LEU: return AARCH64_LS;
9156 case LTU: return AARCH64_CC;
9157 default: return -1;
9158 }
9159 break;
9160
9161 case E_CC_SWPmode:
9162 switch (comp_code)
9163 {
9164 case NE: return AARCH64_NE;
9165 case EQ: return AARCH64_EQ;
9166 case GE: return AARCH64_LE;
9167 case GT: return AARCH64_LT;
9168 case LE: return AARCH64_GE;
9169 case LT: return AARCH64_GT;
9170 case GEU: return AARCH64_LS;
9171 case GTU: return AARCH64_CC;
9172 case LEU: return AARCH64_CS;
9173 case LTU: return AARCH64_HI;
9174 default: return -1;
9175 }
9176 break;
9177
9178 case E_CC_NZCmode:
9179 switch (comp_code)
9180 {
9181 case NE: return AARCH64_NE; /* = any */
9182 case EQ: return AARCH64_EQ; /* = none */
9183 case GE: return AARCH64_PL; /* = nfrst */
9184 case LT: return AARCH64_MI; /* = first */
9185 case GEU: return AARCH64_CS; /* = nlast */
9186 case GTU: return AARCH64_HI; /* = pmore */
9187 case LEU: return AARCH64_LS; /* = plast */
9188 case LTU: return AARCH64_CC; /* = last */
9189 default: return -1;
9190 }
9191 break;
9192
9193 case E_CC_NZmode:
9194 switch (comp_code)
9195 {
9196 case NE: return AARCH64_NE;
9197 case EQ: return AARCH64_EQ;
9198 case GE: return AARCH64_PL;
9199 case LT: return AARCH64_MI;
9200 default: return -1;
9201 }
9202 break;
9203
9204 case E_CC_Zmode:
9205 switch (comp_code)
9206 {
9207 case NE: return AARCH64_NE;
9208 case EQ: return AARCH64_EQ;
9209 default: return -1;
9210 }
9211 break;
9212
9213 case E_CC_Cmode:
9214 switch (comp_code)
9215 {
9216 case LTU: return AARCH64_CS;
9217 case GEU: return AARCH64_CC;
9218 default: return -1;
9219 }
9220 break;
9221
9222 case E_CC_ADCmode:
9223 switch (comp_code)
9224 {
9225 case GEU: return AARCH64_CS;
9226 case LTU: return AARCH64_CC;
9227 default: return -1;
9228 }
9229 break;
9230
9231 case E_CC_Vmode:
9232 switch (comp_code)
9233 {
9234 case NE: return AARCH64_VS;
9235 case EQ: return AARCH64_VC;
9236 default: return -1;
9237 }
9238 break;
9239
9240 default:
9241 return -1;
9242 }
9243
9244 return -1;
9245 }
9246
9247 bool
9248 aarch64_const_vec_all_same_in_range_p (rtx x,
9249 HOST_WIDE_INT minval,
9250 HOST_WIDE_INT maxval)
9251 {
9252 rtx elt;
9253 return (const_vec_duplicate_p (x, &elt)
9254 && CONST_INT_P (elt)
9255 && IN_RANGE (INTVAL (elt), minval, maxval));
9256 }
9257
9258 bool
9259 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9260 {
9261 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9262 }
9263
9264 /* Return true if VEC is a constant in which every element is in the range
9265 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9266
9267 static bool
9268 aarch64_const_vec_all_in_range_p (rtx vec,
9269 HOST_WIDE_INT minval,
9270 HOST_WIDE_INT maxval)
9271 {
9272 if (GET_CODE (vec) != CONST_VECTOR
9273 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9274 return false;
9275
9276 int nunits;
9277 if (!CONST_VECTOR_STEPPED_P (vec))
9278 nunits = const_vector_encoded_nelts (vec);
9279 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9280 return false;
9281
9282 for (int i = 0; i < nunits; i++)
9283 {
9284 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9285 if (!CONST_INT_P (vec_elem)
9286 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9287 return false;
9288 }
9289 return true;
9290 }
9291
9292 /* N Z C V. */
9293 #define AARCH64_CC_V 1
9294 #define AARCH64_CC_C (1 << 1)
9295 #define AARCH64_CC_Z (1 << 2)
9296 #define AARCH64_CC_N (1 << 3)
9297
9298 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9299 static const int aarch64_nzcv_codes[] =
9300 {
9301 0, /* EQ, Z == 1. */
9302 AARCH64_CC_Z, /* NE, Z == 0. */
9303 0, /* CS, C == 1. */
9304 AARCH64_CC_C, /* CC, C == 0. */
9305 0, /* MI, N == 1. */
9306 AARCH64_CC_N, /* PL, N == 0. */
9307 0, /* VS, V == 1. */
9308 AARCH64_CC_V, /* VC, V == 0. */
9309   0,		/* HI, C == 1 && Z == 0.  */
9310 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9311 AARCH64_CC_V, /* GE, N == V. */
9312 0, /* LT, N != V. */
9313 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9314 0, /* LE, !(Z == 0 && N == V). */
9315 0, /* AL, Any. */
9316 0 /* NV, Any. */
9317 };
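
/* Note (editorial, illustrative): each entry above is the NZCV setting
   that makes the corresponding condition false; e.g. the '%k' output
   modifier below prints 4 (the Z bit) for AARCH64_NE and 0 for AARCH64_EQ.
   This is the flag value a CCMP supplies when an earlier test in a
   conditional-compare chain has already failed.  */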
9318
9319 /* Print floating-point vector immediate operand X to F, negating it
9320 first if NEGATE is true. Return true on success, false if it isn't
9321 a constant we can handle. */
9322
9323 static bool
9324 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9325 {
9326 rtx elt;
9327
9328 if (!const_vec_duplicate_p (x, &elt))
9329 return false;
9330
9331 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9332 if (negate)
9333 r = real_value_negate (&r);
9334
9335 /* Handle the SVE single-bit immediates specially, since they have a
9336 fixed form in the assembly syntax. */
9337 if (real_equal (&r, &dconst0))
9338 asm_fprintf (f, "0.0");
9339 else if (real_equal (&r, &dconst2))
9340 asm_fprintf (f, "2.0");
9341 else if (real_equal (&r, &dconst1))
9342 asm_fprintf (f, "1.0");
9343 else if (real_equal (&r, &dconsthalf))
9344 asm_fprintf (f, "0.5");
9345 else
9346 {
9347 const int buf_size = 20;
9348 char float_buf[buf_size] = {'\0'};
9349 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9350 1, GET_MODE (elt));
9351 asm_fprintf (f, "%s", float_buf);
9352 }
9353
9354 return true;
9355 }
9356
9357 /* Return the equivalent letter for size. */
9358 static char
9359 sizetochar (int size)
9360 {
9361 switch (size)
9362 {
9363 case 64: return 'd';
9364 case 32: return 's';
9365 case 16: return 'h';
9366 case 8 : return 'b';
9367 default: gcc_unreachable ();
9368 }
9369 }
9370
9371 /* Print operand X to file F in a target specific manner according to CODE.
9372 The acceptable formatting commands given by CODE are:
9373 'c': An integer or symbol address without a preceding #
9374 sign.
9375 'C': Take the duplicated element in a vector constant
9376 and print it in hex.
9377 'D': Take the duplicated element in a vector constant
9378 and print it as an unsigned integer, in decimal.
9379 'e': Print the sign/zero-extend size as a character 8->b,
9380 16->h, 32->w. Can also be used for masks:
9381 0xff->b, 0xffff->h, 0xffffffff->w.
9382 'I': If the operand is a duplicated vector constant,
9383 replace it with the duplicated scalar. If the
9384 operand is then a floating-point constant, replace
9385 it with the integer bit representation. Print the
9386 transformed constant as a signed decimal number.
9387 'p': Prints N such that 2^N == X (X must be power of 2 and
9388 const int).
9389 'P': Print the number of non-zero bits in X (a const_int).
9390 'H': Print the higher numbered register of a pair (TImode)
9391 of regs.
9392 'm': Print a condition (eq, ne, etc).
9393 'M': Same as 'm', but invert condition.
9394 'N': Take the duplicated element in a vector constant
9395 and print the negative of it in decimal.
9396 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9397 'S/T/U/V': Print a FP/SIMD register name for a register list.
9398 The register printed is the FP/SIMD register name
9399 of X + 0/1/2/3 for S/T/U/V.
9400 'R': Print a scalar Integer/FP/SIMD register name + 1.
9401 'X': Print bottom 16 bits of integer constant in hex.
9402 'w/x': Print a general register name or the zero register
9403 (32-bit or 64-bit).
9404 '0': Print a normal operand, if it's a general register,
9405 then we assume DImode.
9406 'k': Print NZCV for conditional compare instructions.
9407 'A': Output address constant representing the first
9408 argument of X, specifying a relocation offset
9409 if appropriate.
9410 'L': Output constant address specified by X
9411 with a relocation offset if appropriate.
9412 'G': Prints address of X, specifying a PC relative
9413 relocation mode if appropriate.
9414 'y': Output address of LDP or STP - this is used for
9415 some LDP/STPs which don't use a PARALLEL in their
9416 pattern (so the mode needs to be adjusted).
9417 'z': Output address of a typical LDP or STP. */
9418
9419 static void
9420 aarch64_print_operand (FILE *f, rtx x, int code)
9421 {
9422 rtx elt;
9423 switch (code)
9424 {
9425 case 'c':
9426 switch (GET_CODE (x))
9427 {
9428 case CONST_INT:
9429 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9430 break;
9431
9432 case SYMBOL_REF:
9433 output_addr_const (f, x);
9434 break;
9435
9436 case CONST:
9437 if (GET_CODE (XEXP (x, 0)) == PLUS
9438 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9439 {
9440 output_addr_const (f, x);
9441 break;
9442 }
9443 /* Fall through. */
9444
9445 default:
9446 output_operand_lossage ("unsupported operand for code '%c'", code);
9447 }
9448 break;
9449
9450 case 'e':
9451 {
9452 x = unwrap_const_vec_duplicate (x);
9453 if (!CONST_INT_P (x))
9454 {
9455 output_operand_lossage ("invalid operand for '%%%c'", code);
9456 return;
9457 }
9458
9459 HOST_WIDE_INT val = INTVAL (x);
9460 if ((val & ~7) == 8 || val == 0xff)
9461 fputc ('b', f);
9462 else if ((val & ~7) == 16 || val == 0xffff)
9463 fputc ('h', f);
9464 else if ((val & ~7) == 32 || val == 0xffffffff)
9465 fputc ('w', f);
9466 else
9467 {
9468 output_operand_lossage ("invalid operand for '%%%c'", code);
9469 return;
9470 }
9471 }
9472 break;
9473
9474 case 'p':
9475 {
9476 int n;
9477
9478 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9479 {
9480 output_operand_lossage ("invalid operand for '%%%c'", code);
9481 return;
9482 }
9483
9484 asm_fprintf (f, "%d", n);
9485 }
9486 break;
9487
9488 case 'P':
9489 if (!CONST_INT_P (x))
9490 {
9491 output_operand_lossage ("invalid operand for '%%%c'", code);
9492 return;
9493 }
9494
9495 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9496 break;
9497
9498 case 'H':
9499 if (x == const0_rtx)
9500 {
9501 asm_fprintf (f, "xzr");
9502 break;
9503 }
9504
9505 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9506 {
9507 output_operand_lossage ("invalid operand for '%%%c'", code);
9508 return;
9509 }
9510
9511 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9512 break;
9513
9514 case 'I':
9515 {
9516 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9517 if (CONST_INT_P (x))
9518 asm_fprintf (f, "%wd", INTVAL (x));
9519 else
9520 {
9521 output_operand_lossage ("invalid operand for '%%%c'", code);
9522 return;
9523 }
9524 break;
9525 }
9526
9527 case 'M':
9528 case 'm':
9529 {
9530 int cond_code;
9531 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9532 if (x == const_true_rtx)
9533 {
9534 if (code == 'M')
9535 fputs ("nv", f);
9536 return;
9537 }
9538
9539 if (!COMPARISON_P (x))
9540 {
9541 output_operand_lossage ("invalid operand for '%%%c'", code);
9542 return;
9543 }
9544
9545 cond_code = aarch64_get_condition_code (x);
9546 gcc_assert (cond_code >= 0);
9547 if (code == 'M')
9548 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9549 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9550 fputs (aarch64_sve_condition_codes[cond_code], f);
9551 else
9552 fputs (aarch64_condition_codes[cond_code], f);
9553 }
9554 break;
9555
9556 case 'N':
9557 if (!const_vec_duplicate_p (x, &elt))
9558 {
9559 output_operand_lossage ("invalid vector constant");
9560 return;
9561 }
9562
9563 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9564 asm_fprintf (f, "%wd", -INTVAL (elt));
9565 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9566 && aarch64_print_vector_float_operand (f, x, true))
9567 ;
9568 else
9569 {
9570 output_operand_lossage ("invalid vector constant");
9571 return;
9572 }
9573 break;
9574
9575 case 'b':
9576 case 'h':
9577 case 's':
9578 case 'd':
9579 case 'q':
9580 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9581 {
9582 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9583 return;
9584 }
9585 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9586 break;
9587
9588 case 'S':
9589 case 'T':
9590 case 'U':
9591 case 'V':
9592 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9593 {
9594 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9595 return;
9596 }
9597 asm_fprintf (f, "%c%d",
9598 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9599 REGNO (x) - V0_REGNUM + (code - 'S'));
9600 break;
9601
9602 case 'R':
9603 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9604 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9605 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9606 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9607 else
9608 output_operand_lossage ("incompatible register operand for '%%%c'",
9609 code);
9610 break;
9611
9612 case 'X':
9613 if (!CONST_INT_P (x))
9614 {
9615 output_operand_lossage ("invalid operand for '%%%c'", code);
9616 return;
9617 }
9618 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9619 break;
9620
9621 case 'C':
9622 {
9623 /* Print a replicated constant in hex. */
9624 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9625 {
9626 output_operand_lossage ("invalid operand for '%%%c'", code);
9627 return;
9628 }
9629 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9630 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9631 }
9632 break;
9633
9634 case 'D':
9635 {
9636 /* Print a replicated constant in decimal, treating it as
9637 unsigned. */
9638 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9639 {
9640 output_operand_lossage ("invalid operand for '%%%c'", code);
9641 return;
9642 }
9643 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9644 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9645 }
9646 break;
9647
9648 case 'w':
9649 case 'x':
9650 if (x == const0_rtx
9651 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9652 {
9653 asm_fprintf (f, "%czr", code);
9654 break;
9655 }
9656
9657 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9658 {
9659 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9660 break;
9661 }
9662
9663 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9664 {
9665 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9666 break;
9667 }
9668
9669 /* Fall through */
9670
9671 case 0:
9672 if (x == NULL)
9673 {
9674 output_operand_lossage ("missing operand");
9675 return;
9676 }
9677
9678 switch (GET_CODE (x))
9679 {
9680 case REG:
9681 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9682 {
9683 if (REG_NREGS (x) == 1)
9684 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9685 else
9686 {
9687 char suffix
9688 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9689 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9690 REGNO (x) - V0_REGNUM, suffix,
9691 END_REGNO (x) - V0_REGNUM - 1, suffix);
9692 }
9693 }
9694 else
9695 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9696 break;
9697
9698 case MEM:
9699 output_address (GET_MODE (x), XEXP (x, 0));
9700 break;
9701
9702 case LABEL_REF:
9703 case SYMBOL_REF:
9704 output_addr_const (asm_out_file, x);
9705 break;
9706
9707 case CONST_INT:
9708 asm_fprintf (f, "%wd", INTVAL (x));
9709 break;
9710
9711 case CONST:
9712 if (!VECTOR_MODE_P (GET_MODE (x)))
9713 {
9714 output_addr_const (asm_out_file, x);
9715 break;
9716 }
9717 /* fall through */
9718
9719 case CONST_VECTOR:
9720 if (!const_vec_duplicate_p (x, &elt))
9721 {
9722 output_operand_lossage ("invalid vector constant");
9723 return;
9724 }
9725
9726 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9727 asm_fprintf (f, "%wd", INTVAL (elt));
9728 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9729 && aarch64_print_vector_float_operand (f, x, false))
9730 ;
9731 else
9732 {
9733 output_operand_lossage ("invalid vector constant");
9734 return;
9735 }
9736 break;
9737
9738 case CONST_DOUBLE:
9739 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9740 be getting CONST_DOUBLEs holding integers. */
9741 gcc_assert (GET_MODE (x) != VOIDmode);
9742 if (aarch64_float_const_zero_rtx_p (x))
9743 {
9744 fputc ('0', f);
9745 break;
9746 }
9747 else if (aarch64_float_const_representable_p (x))
9748 {
9749 #define buf_size 20
9750 char float_buf[buf_size] = {'\0'};
9751 real_to_decimal_for_mode (float_buf,
9752 CONST_DOUBLE_REAL_VALUE (x),
9753 buf_size, buf_size,
9754 1, GET_MODE (x));
9755 asm_fprintf (asm_out_file, "%s", float_buf);
9756 break;
9757 #undef buf_size
9758 }
9759 output_operand_lossage ("invalid constant");
9760 return;
9761 default:
9762 output_operand_lossage ("invalid operand");
9763 return;
9764 }
9765 break;
9766
9767 case 'A':
9768 if (GET_CODE (x) == HIGH)
9769 x = XEXP (x, 0);
9770
9771 switch (aarch64_classify_symbolic_expression (x))
9772 {
9773 case SYMBOL_SMALL_GOT_4G:
9774 asm_fprintf (asm_out_file, ":got:");
9775 break;
9776
9777 case SYMBOL_SMALL_TLSGD:
9778 asm_fprintf (asm_out_file, ":tlsgd:");
9779 break;
9780
9781 case SYMBOL_SMALL_TLSDESC:
9782 asm_fprintf (asm_out_file, ":tlsdesc:");
9783 break;
9784
9785 case SYMBOL_SMALL_TLSIE:
9786 asm_fprintf (asm_out_file, ":gottprel:");
9787 break;
9788
9789 case SYMBOL_TLSLE24:
9790 asm_fprintf (asm_out_file, ":tprel:");
9791 break;
9792
9793 case SYMBOL_TINY_GOT:
9794 gcc_unreachable ();
9795 break;
9796
9797 default:
9798 break;
9799 }
9800 output_addr_const (asm_out_file, x);
9801 break;
9802
9803 case 'L':
9804 switch (aarch64_classify_symbolic_expression (x))
9805 {
9806 case SYMBOL_SMALL_GOT_4G:
9807 asm_fprintf (asm_out_file, ":lo12:");
9808 break;
9809
9810 case SYMBOL_SMALL_TLSGD:
9811 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9812 break;
9813
9814 case SYMBOL_SMALL_TLSDESC:
9815 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9816 break;
9817
9818 case SYMBOL_SMALL_TLSIE:
9819 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9820 break;
9821
9822 case SYMBOL_TLSLE12:
9823 asm_fprintf (asm_out_file, ":tprel_lo12:");
9824 break;
9825
9826 case SYMBOL_TLSLE24:
9827 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9828 break;
9829
9830 case SYMBOL_TINY_GOT:
9831 asm_fprintf (asm_out_file, ":got:");
9832 break;
9833
9834 case SYMBOL_TINY_TLSIE:
9835 asm_fprintf (asm_out_file, ":gottprel:");
9836 break;
9837
9838 default:
9839 break;
9840 }
9841 output_addr_const (asm_out_file, x);
9842 break;
9843
9844 case 'G':
9845 switch (aarch64_classify_symbolic_expression (x))
9846 {
9847 case SYMBOL_TLSLE24:
9848 asm_fprintf (asm_out_file, ":tprel_hi12:");
9849 break;
9850 default:
9851 break;
9852 }
9853 output_addr_const (asm_out_file, x);
9854 break;
9855
9856 case 'k':
9857 {
9858 HOST_WIDE_INT cond_code;
9859
9860 if (!CONST_INT_P (x))
9861 {
9862 output_operand_lossage ("invalid operand for '%%%c'", code);
9863 return;
9864 }
9865
9866 cond_code = INTVAL (x);
9867 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9868 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9869 }
9870 break;
9871
9872 case 'y':
9873 case 'z':
9874 {
9875 machine_mode mode = GET_MODE (x);
9876
9877 if (GET_CODE (x) != MEM
9878 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9879 {
9880 output_operand_lossage ("invalid operand for '%%%c'", code);
9881 return;
9882 }
9883
9884 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9885 code == 'y'
9886 ? ADDR_QUERY_LDP_STP_N
9887 : ADDR_QUERY_LDP_STP))
9888 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9889 }
9890 break;
9891
9892 default:
9893 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9894 return;
9895 }
9896 }
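
/* Illustrative examples (editorial; register numbers are arbitrary):
   given (reg:SI 0) the '%w' modifier above prints "w0" and '%x' prints
   "x0"; given the CONST_INT 0x11234 the '%X' modifier prints "0x1234"
   (the low 16 bits); and '%p' prints 3 for the CONST_INT 8.  */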
9897
9898 /* Print address 'x' of a memory access with mode 'mode'.
9899    TYPE is the aarch64_addr_query_type context required by
9900    aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand.  */
9901 static bool
9902 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9903 aarch64_addr_query_type type)
9904 {
9905 struct aarch64_address_info addr;
9906 unsigned int size, vec_flags;
9907
9908 /* Check all addresses are Pmode - including ILP32. */
9909 if (GET_MODE (x) != Pmode
9910 && (!CONST_INT_P (x)
9911 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9912 {
9913 output_operand_lossage ("invalid address mode");
9914 return false;
9915 }
9916
9917 if (aarch64_classify_address (&addr, x, mode, true, type))
9918 switch (addr.type)
9919 {
9920 case ADDRESS_REG_IMM:
9921 if (known_eq (addr.const_offset, 0))
9922 {
9923 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9924 return true;
9925 }
9926
9927 vec_flags = aarch64_classify_vector_mode (mode);
9928 if (vec_flags & VEC_ANY_SVE)
9929 {
9930 HOST_WIDE_INT vnum
9931 = exact_div (addr.const_offset,
9932 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9933 asm_fprintf (f, "[%s, #%wd, mul vl]",
9934 reg_names[REGNO (addr.base)], vnum);
9935 return true;
9936 }
9937
9938 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9939 INTVAL (addr.offset));
9940 return true;
9941
9942 case ADDRESS_REG_REG:
9943 if (addr.shift == 0)
9944 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9945 reg_names [REGNO (addr.offset)]);
9946 else
9947 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9948 reg_names [REGNO (addr.offset)], addr.shift);
9949 return true;
9950
9951 case ADDRESS_REG_UXTW:
9952 if (addr.shift == 0)
9953 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9954 REGNO (addr.offset) - R0_REGNUM);
9955 else
9956 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9957 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9958 return true;
9959
9960 case ADDRESS_REG_SXTW:
9961 if (addr.shift == 0)
9962 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9963 REGNO (addr.offset) - R0_REGNUM);
9964 else
9965 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9966 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9967 return true;
9968
9969 case ADDRESS_REG_WB:
9970 /* Writeback is only supported for fixed-width modes. */
9971 size = GET_MODE_SIZE (mode).to_constant ();
9972 switch (GET_CODE (x))
9973 {
9974 case PRE_INC:
9975 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9976 return true;
9977 case POST_INC:
9978 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9979 return true;
9980 case PRE_DEC:
9981 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9982 return true;
9983 case POST_DEC:
9984 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9985 return true;
9986 case PRE_MODIFY:
9987 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9988 INTVAL (addr.offset));
9989 return true;
9990 case POST_MODIFY:
9991 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9992 INTVAL (addr.offset));
9993 return true;
9994 default:
9995 break;
9996 }
9997 break;
9998
9999 case ADDRESS_LO_SUM:
10000 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10001 output_addr_const (f, addr.offset);
10002 asm_fprintf (f, "]");
10003 return true;
10004
10005 case ADDRESS_SYMBOLIC:
10006 output_addr_const (f, x);
10007 return true;
10008 }
10009
10010 return false;
10011 }
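
/* Illustrative examples (editorial; register names are arbitrary):
   (plus (reg x0) (const_int 16)) prints as "[x0, 16]", (post_inc (reg x0))
   for a DImode access prints as "[x0], 8", and an SVE data-mode offset of
   one vector prints as "[x0, #1, mul vl]".  */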
10012
10013 /* Print address 'x' of a memory access with mode 'mode'. */
10014 static void
10015 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10016 {
10017 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10018 output_addr_const (f, x);
10019 }
10020
10021 bool
10022 aarch64_label_mentioned_p (rtx x)
10023 {
10024 const char *fmt;
10025 int i;
10026
10027 if (GET_CODE (x) == LABEL_REF)
10028 return true;
10029
10030 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10031 referencing instruction, but they are constant offsets, not
10032 symbols. */
10033 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10034 return false;
10035
10036 fmt = GET_RTX_FORMAT (GET_CODE (x));
10037 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10038 {
10039 if (fmt[i] == 'E')
10040 {
10041 int j;
10042
10043 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10044 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10045 return 1;
10046 }
10047 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10048 return 1;
10049 }
10050
10051 return 0;
10052 }
10053
10054 /* Implement REGNO_REG_CLASS. */
10055
10056 enum reg_class
10057 aarch64_regno_regclass (unsigned regno)
10058 {
10059 if (GP_REGNUM_P (regno))
10060 return GENERAL_REGS;
10061
10062 if (regno == SP_REGNUM)
10063 return STACK_REG;
10064
10065 if (regno == FRAME_POINTER_REGNUM
10066 || regno == ARG_POINTER_REGNUM)
10067 return POINTER_REGS;
10068
10069 if (FP_REGNUM_P (regno))
10070 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10071 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10072
10073 if (PR_REGNUM_P (regno))
10074 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10075
10076 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10077 return FFR_REGS;
10078
10079 return NO_REGS;
10080 }
10081
10082 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10083 If OFFSET is out of range, return an offset of an anchor point
10084 that is in range. Return 0 otherwise. */
10085
10086 static HOST_WIDE_INT
10087 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10088 machine_mode mode)
10089 {
10090 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10091 if (size > 16)
10092 return (offset + 0x400) & ~0x7f0;
10093
10094 /* For offsets that aren't a multiple of the access size, the limit is
10095 -256...255. */
10096 if (offset & (size - 1))
10097 {
10098 /* BLKmode typically uses LDP of X-registers. */
10099 if (mode == BLKmode)
10100 return (offset + 512) & ~0x3ff;
10101 return (offset + 0x100) & ~0x1ff;
10102 }
10103
10104 /* Small negative offsets are supported. */
10105 if (IN_RANGE (offset, -256, 0))
10106 return 0;
10107
10108 if (mode == TImode || mode == TFmode)
10109 return (offset + 0x100) & ~0x1ff;
10110
10111   /* Use a 12-bit offset scaled by the access size.  */
10112 return offset & (~0xfff * size);
10113 }
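
/* Worked example (editorial, illustrative only): for an aligned DImode
   access at offset 0x9008 the final case applies, giving an anchor of
   0x9008 & (~0xfff * 8) == 0x8000; the residual offset 0x1008 is then
   still representable as a scaled 12-bit immediate (0x1008 / 8 == 0x201).  */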
10114
10115 static rtx
10116 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10117 {
10118 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10119 where mask is selected by alignment and size of the offset.
10120 We try to pick as large a range for the offset as possible to
10121 maximize the chance of a CSE. However, for aligned addresses
10122 we limit the range to 4k so that structures with different sized
10123 elements are likely to use the same base. We need to be careful
10124 not to split a CONST for some forms of address expression, otherwise
10125 it will generate sub-optimal code. */
10126
10127 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10128 {
10129 rtx base = XEXP (x, 0);
10130 rtx offset_rtx = XEXP (x, 1);
10131 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10132
10133 if (GET_CODE (base) == PLUS)
10134 {
10135 rtx op0 = XEXP (base, 0);
10136 rtx op1 = XEXP (base, 1);
10137
10138 /* Force any scaling into a temp for CSE. */
10139 op0 = force_reg (Pmode, op0);
10140 op1 = force_reg (Pmode, op1);
10141
10142 /* Let the pointer register be in op0. */
10143 if (REG_POINTER (op1))
10144 std::swap (op0, op1);
10145
10146 /* If the pointer is virtual or frame related, then we know that
10147 virtual register instantiation or register elimination is going
10148 to apply a second constant. We want the two constants folded
10149 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10150 if (virt_or_elim_regno_p (REGNO (op0)))
10151 {
10152 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10153 NULL_RTX, true, OPTAB_DIRECT);
10154 return gen_rtx_PLUS (Pmode, base, op1);
10155 }
10156
10157 /* Otherwise, in order to encourage CSE (and thence loop strength
10158 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10159 base = expand_binop (Pmode, add_optab, op0, op1,
10160 NULL_RTX, true, OPTAB_DIRECT);
10161 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10162 }
10163
10164 HOST_WIDE_INT size;
10165 if (GET_MODE_SIZE (mode).is_constant (&size))
10166 {
10167 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10168 mode);
10169 if (base_offset != 0)
10170 {
10171 base = plus_constant (Pmode, base, base_offset);
10172 base = force_operand (base, NULL_RTX);
10173 return plus_constant (Pmode, base, offset - base_offset);
10174 }
10175 }
10176 }
10177
10178 return x;
10179 }
10180
10181 static reg_class_t
10182 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10183 reg_class_t rclass,
10184 machine_mode mode,
10185 secondary_reload_info *sri)
10186 {
10187 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10188 LDR and STR. See the comment at the head of aarch64-sve.md for
10189 more details about the big-endian handling. */
10190 if (reg_class_subset_p (rclass, FP_REGS)
10191 && !((REG_P (x) && HARD_REGISTER_P (x))
10192 || aarch64_simd_valid_immediate (x, NULL))
10193 && mode != VNx16QImode)
10194 {
10195 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10196 if ((vec_flags & VEC_SVE_DATA)
10197 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10198 {
10199 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10200 return NO_REGS;
10201 }
10202 }
10203
10204 /* If we have to disable direct literal pool loads and stores because the
10205 function is too big, then we need a scratch register. */
10206 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10207 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10208 || targetm.vector_mode_supported_p (GET_MODE (x)))
10209 && !aarch64_pcrelative_literal_loads)
10210 {
10211 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10212 return NO_REGS;
10213 }
10214
10215 /* Without the TARGET_SIMD instructions we cannot move a Q register
10216 to a Q register directly. We need a scratch. */
10217 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10218 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10219 && reg_class_subset_p (rclass, FP_REGS))
10220 {
10221 sri->icode = code_for_aarch64_reload_mov (mode);
10222 return NO_REGS;
10223 }
10224
10225   /* A TFmode or TImode memory access should be handled via FP_REGS
10226 because AArch64 has richer addressing modes for LDR/STR instructions
10227 than LDP/STP instructions. */
10228 if (TARGET_FLOAT && rclass == GENERAL_REGS
10229 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10230 return FP_REGS;
10231
10232   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
10233 return GENERAL_REGS;
10234
10235 return NO_REGS;
10236 }
10237
10238 static bool
10239 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10240 {
10241 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10242
10243 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10244 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10245 if (frame_pointer_needed)
10246 return to == HARD_FRAME_POINTER_REGNUM;
10247 return true;
10248 }
10249
10250 poly_int64
10251 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10252 {
10253 if (to == HARD_FRAME_POINTER_REGNUM)
10254 {
10255 if (from == ARG_POINTER_REGNUM)
10256 return cfun->machine->frame.hard_fp_offset;
10257
10258 if (from == FRAME_POINTER_REGNUM)
10259 return cfun->machine->frame.hard_fp_offset
10260 - cfun->machine->frame.locals_offset;
10261 }
10262
10263 if (to == STACK_POINTER_REGNUM)
10264 {
10265 if (from == FRAME_POINTER_REGNUM)
10266 return cfun->machine->frame.frame_size
10267 - cfun->machine->frame.locals_offset;
10268 }
10269
10270 return cfun->machine->frame.frame_size;
10271 }
10272
10273 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10274 previous frame. */
10275
10276 rtx
10277 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10278 {
10279 if (count != 0)
10280 return const0_rtx;
10281 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10282 }
10283
10284
10285 static void
10286 aarch64_asm_trampoline_template (FILE *f)
10287 {
10288 int offset1 = 16;
10289 int offset2 = 20;
10290
10291 if (aarch64_bti_enabled ())
10292 {
10293 asm_fprintf (f, "\thint\t34 // bti c\n");
10294 offset1 -= 4;
10295 offset2 -= 4;
10296 }
10297
10298 if (TARGET_ILP32)
10299 {
10300 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10301 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10302 offset1);
10303 }
10304 else
10305 {
10306 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10307 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10308 offset2);
10309 }
10310 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10311
10312 /* The trampoline needs an extra padding instruction. If BTI is
10313 enabled, the padding instruction is replaced by the BTI instruction at
10314 the beginning. */
10315 if (!aarch64_bti_enabled ())
10316 assemble_aligned_integer (4, const0_rtx);
10317
10318 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10319 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10320 }
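/* For reference, a rough sketch of the LP64 trampoline that the template
   above produces when BTI is not enabled (assuming, as elsewhere in this
   port, that IP1 is x17 and the static chain register is x18):

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	<4 bytes of padding>
	<8-byte slot: function address, filled in by aarch64_trampoline_init>
	<8-byte slot: static chain value, filled in by aarch64_trampoline_init>

   aarch64_trampoline_init below copies the 16 bytes of code and then stores
   the real function address and chain value into the two trailing slots.  */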
10321
10322 static void
10323 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10324 {
10325 rtx fnaddr, mem, a_tramp;
10326 const int tramp_code_sz = 16;
10327
10328 /* We don't need to copy the trailing D-words; we fill those in below. */
10329 emit_block_move (m_tramp, assemble_trampoline_template (),
10330 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10331 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10332 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10333 if (GET_MODE (fnaddr) != ptr_mode)
10334 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10335 emit_move_insn (mem, fnaddr);
10336
10337 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10338 emit_move_insn (mem, chain_value);
10339
10340 /* XXX We should really define a "clear_cache" pattern and use
10341 gen_clear_cache(). */
10342 a_tramp = XEXP (m_tramp, 0);
10343 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10344 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10345 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10346 ptr_mode);
10347 }
10348
10349 static unsigned char
10350 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10351 {
10352 /* ??? Logically we should only need to provide a value when
10353 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10354 can hold MODE, but at the moment we need to handle all modes.
10355 Just ignore any runtime parts for registers that can't store them. */
10356 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10357 unsigned int nregs, vec_flags;
10358 switch (regclass)
10359 {
10360 case TAILCALL_ADDR_REGS:
10361 case POINTER_REGS:
10362 case GENERAL_REGS:
10363 case ALL_REGS:
10364 case POINTER_AND_FP_REGS:
10365 case FP_REGS:
10366 case FP_LO_REGS:
10367 case FP_LO8_REGS:
10368 vec_flags = aarch64_classify_vector_mode (mode);
10369 if ((vec_flags & VEC_SVE_DATA)
10370 && constant_multiple_p (GET_MODE_SIZE (mode),
10371 aarch64_vl_bytes (mode, vec_flags), &nregs))
10372 return nregs;
10373 return (vec_flags & VEC_ADVSIMD
10374 ? CEIL (lowest_size, UNITS_PER_VREG)
10375 : CEIL (lowest_size, UNITS_PER_WORD));
10376 case STACK_REG:
10377 case PR_REGS:
10378 case PR_LO_REGS:
10379 case PR_HI_REGS:
10380 case FFR_REGS:
10381 case PR_AND_FFR_REGS:
10382 return 1;
10383
10384 case NO_REGS:
10385 return 0;
10386
10387 default:
10388 break;
10389 }
10390 gcc_unreachable ();
10391 }
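/* A couple of illustrative cases for the function above (a sketch, assuming
   the usual 64-bit register sizes): a 16-byte Advanced SIMD mode such as
   V4SImode needs one FP register (CEIL (16, UNITS_PER_VREG) == 1), while
   TImode in GENERAL_REGS needs a register pair (CEIL (16, UNITS_PER_WORD)
   == 2).  An SVE data mode instead needs one register per vector's worth
   of bytes, so the answer is independent of the runtime vector length.  */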
10392
10393 static reg_class_t
10394 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10395 {
10396 if (regclass == POINTER_REGS)
10397 return GENERAL_REGS;
10398
10399 if (regclass == STACK_REG)
10400 {
10401 if (REG_P (x)
10402 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10403 return regclass;
10404
10405 return NO_REGS;
10406 }
10407
10408 /* Register elimination can result in a request for
10409 SP+constant->FP_REGS. We cannot support such operations, which
10410 use SP as the source and an FP_REG as the destination, so reject
10411 them outright. */
10412 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10413 {
10414 rtx lhs = XEXP (x, 0);
10415
10416 /* Look through a possible SUBREG introduced by ILP32. */
10417 if (GET_CODE (lhs) == SUBREG)
10418 lhs = SUBREG_REG (lhs);
10419
10420 gcc_assert (REG_P (lhs));
10421 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10422 POINTER_REGS));
10423 return NO_REGS;
10424 }
10425
10426 return regclass;
10427 }
10428
10429 void
10430 aarch64_asm_output_labelref (FILE* f, const char *name)
10431 {
10432 asm_fprintf (f, "%U%s", name);
10433 }
10434
10435 static void
10436 aarch64_elf_asm_constructor (rtx symbol, int priority)
10437 {
10438 if (priority == DEFAULT_INIT_PRIORITY)
10439 default_ctor_section_asm_out_constructor (symbol, priority);
10440 else
10441 {
10442 section *s;
10443 /* While priority is known to be in the range [0, 65535] (so 18 bytes
10444 would be enough), the compiler might not know that. To avoid a
10445 -Wformat-truncation false positive, use a larger size. */
10446 char buf[23];
10447 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10448 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10449 switch_to_section (s);
10450 assemble_align (POINTER_SIZE);
10451 assemble_aligned_integer (POINTER_BYTES, symbol);
10452 }
10453 }
10454
10455 static void
10456 aarch64_elf_asm_destructor (rtx symbol, int priority)
10457 {
10458 if (priority == DEFAULT_INIT_PRIORITY)
10459 default_dtor_section_asm_out_destructor (symbol, priority);
10460 else
10461 {
10462 section *s;
10463 /* While priority is known to be in the range [0, 65535] (so 18 bytes
10464 would be enough), the compiler might not know that. To avoid a
10465 -Wformat-truncation false positive, use a larger size. */
10466 char buf[23];
10467 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10468 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10469 switch_to_section (s);
10470 assemble_align (POINTER_SIZE);
10471 assemble_aligned_integer (POINTER_BYTES, symbol);
10472 }
10473 }
10474
10475 const char*
10476 aarch64_output_casesi (rtx *operands)
10477 {
10478 char buf[100];
10479 char label[100];
10480 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10481 int index;
10482 static const char *const patterns[4][2] =
10483 {
10484 {
10485 "ldrb\t%w3, [%0,%w1,uxtw]",
10486 "add\t%3, %4, %w3, sxtb #2"
10487 },
10488 {
10489 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10490 "add\t%3, %4, %w3, sxth #2"
10491 },
10492 {
10493 "ldr\t%w3, [%0,%w1,uxtw #2]",
10494 "add\t%3, %4, %w3, sxtw #2"
10495 },
10496 /* We assume that DImode is only generated when not optimizing and
10497 that we don't really need 64-bit address offsets. That would
10498 imply an object file with 8GB of code in a single function! */
10499 {
10500 "ldr\t%w3, [%0,%w1,uxtw #2]",
10501 "add\t%3, %4, %w3, sxtw #2"
10502 }
10503 };
10504
10505 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10506
10507 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10508 index = exact_log2 (GET_MODE_SIZE (mode));
10509
10510 gcc_assert (index >= 0 && index <= 3);
10511
10512 /* Need to implement table size reduction, by changing the code below. */
10513 output_asm_insn (patterns[index][0], operands);
10514 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10515 snprintf (buf, sizeof (buf),
10516 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10517 output_asm_insn (buf, operands);
10518 output_asm_insn (patterns[index][1], operands);
10519 output_asm_insn ("br\t%3", operands);
10520 assemble_label (asm_out_file, label);
10521 return "";
10522 }
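/* For illustration only: with word (SImode) table entries the sequence
   emitted above looks roughly like the following, where operand 0 is the
   table base, operand 1 the index, and operands 3/4 are scratch registers
   (the concrete register names and label are just placeholders):

	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted after the br
	add	x3, x4, w3, sxtw #2	// entry is an offset scaled by 4
	br	x3
   .Lrtx<N>:
 */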
10523
10524
10525 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10526 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10527 operator. */
10528
10529 int
10530 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10531 {
10532 if (shift >= 0 && shift <= 3)
10533 {
10534 int size;
10535 for (size = 8; size <= 32; size *= 2)
10536 {
10537 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10538 if (mask == bits << shift)
10539 return size;
10540 }
10541 }
10542 return 0;
10543 }
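/* A few worked examples for the function above (illustrative values only):
   aarch64_uxt_size (2, 0x3fc) returns 8, because 0xff << 2 == 0x3fc, so the
   operand is suitable for a UXTB with LSL #2; aarch64_uxt_size (0, 0xffff)
   returns 16 (UXTH); aarch64_uxt_size (1, 0xff) returns 0, because 0xff is
   not any of 0xff, 0xffff or 0xffffffff shifted left by one.  */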
10544
10545 /* Constant pools are per-function only when PC-relative
10546 literal loads are enabled or we are in the large memory
10547 model. */
10548
10549 static inline bool
10550 aarch64_can_use_per_function_literal_pools_p (void)
10551 {
10552 return (aarch64_pcrelative_literal_loads
10553 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10554 }
10555
10556 static bool
10557 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10558 {
10559 /* We can't use blocks for constants when we're using a per-function
10560 constant pool. */
10561 return !aarch64_can_use_per_function_literal_pools_p ();
10562 }
10563
10564 /* Select appropriate section for constants depending
10565 on where we place literal pools. */
10566
10567 static section *
10568 aarch64_select_rtx_section (machine_mode mode,
10569 rtx x,
10570 unsigned HOST_WIDE_INT align)
10571 {
10572 if (aarch64_can_use_per_function_literal_pools_p ())
10573 return function_section (current_function_decl);
10574
10575 return default_elf_select_rtx_section (mode, x, align);
10576 }
10577
10578 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10579 void
10580 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10581 HOST_WIDE_INT offset)
10582 {
10583 /* When using per-function literal pools, we must ensure that any code
10584 section is aligned to the minimal instruction length, lest we get
10585 errors from the assembler re "unaligned instructions". */
10586 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10587 ASM_OUTPUT_ALIGN (f, 2);
10588 }
10589
10590 /* Costs. */
10591
10592 /* Helper function for rtx cost calculation. Strip a shift expression
10593 from X. Returns the inner operand if successful, or the original
10594 expression on failure. */
10595 static rtx
10596 aarch64_strip_shift (rtx x)
10597 {
10598 rtx op = x;
10599
10600 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10601 we can convert both to ROR during final output. */
10602 if ((GET_CODE (op) == ASHIFT
10603 || GET_CODE (op) == ASHIFTRT
10604 || GET_CODE (op) == LSHIFTRT
10605 || GET_CODE (op) == ROTATERT
10606 || GET_CODE (op) == ROTATE)
10607 && CONST_INT_P (XEXP (op, 1)))
10608 return XEXP (op, 0);
10609
10610 if (GET_CODE (op) == MULT
10611 && CONST_INT_P (XEXP (op, 1))
10612 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10613 return XEXP (op, 0);
10614
10615 return x;
10616 }
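/* Illustrative behaviour of aarch64_strip_shift (a sketch, not exhaustive):
   (ashift (reg x1) (const_int 3)) and (mult (reg x1) (const_int 8)) both
   strip down to (reg x1), since a multiply by 8 is the canonical form of a
   shift by 3; (mult (reg x1) (reg x2)) is returned unchanged because the
   multiplier is not a constant power of two.  */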
10617
10618 /* Helper function for rtx cost calculation. Strip an extend
10619 expression from X. Returns the inner operand if successful, or the
10620 original expression on failure. We deal with a number of possible
10621 canonicalization variations here. If STRIP_SHIFT is true, then
10622 we can strip off a shift also. */
10623 static rtx
10624 aarch64_strip_extend (rtx x, bool strip_shift)
10625 {
10626 scalar_int_mode mode;
10627 rtx op = x;
10628
10629 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10630 return op;
10631
10632 /* Zero and sign extraction of a widened value. */
10633 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10634 && XEXP (op, 2) == const0_rtx
10635 && GET_CODE (XEXP (op, 0)) == MULT
10636 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10637 XEXP (op, 1)))
10638 return XEXP (XEXP (op, 0), 0);
10639
10640 /* It can also be represented (for zero-extend) as an AND with an
10641 immediate. */
10642 if (GET_CODE (op) == AND
10643 && GET_CODE (XEXP (op, 0)) == MULT
10644 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10645 && CONST_INT_P (XEXP (op, 1))
10646 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10647 INTVAL (XEXP (op, 1))) != 0)
10648 return XEXP (XEXP (op, 0), 0);
10649
10650 /* Now handle extended register, as this may also have an optional
10651 left shift by 1..4. */
10652 if (strip_shift
10653 && GET_CODE (op) == ASHIFT
10654 && CONST_INT_P (XEXP (op, 1))
10655 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10656 op = XEXP (op, 0);
10657
10658 if (GET_CODE (op) == ZERO_EXTEND
10659 || GET_CODE (op) == SIGN_EXTEND)
10660 op = XEXP (op, 0);
10661
10662 if (op != x)
10663 return op;
10664
10665 return x;
10666 }
10667
10668 /* Return true iff CODE is a shift supported in combination
10669 with arithmetic instructions. */
10670
10671 static bool
10672 aarch64_shift_p (enum rtx_code code)
10673 {
10674 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10675 }
10676
10677
10678 /* Return true iff X is a cheap shift without a sign extend. */
10679
10680 static bool
10681 aarch64_cheap_mult_shift_p (rtx x)
10682 {
10683 rtx op0, op1;
10684
10685 op0 = XEXP (x, 0);
10686 op1 = XEXP (x, 1);
10687
10688 if (!(aarch64_tune_params.extra_tuning_flags
10689 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10690 return false;
10691
10692 if (GET_CODE (op0) == SIGN_EXTEND)
10693 return false;
10694
10695 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10696 && UINTVAL (op1) <= 4)
10697 return true;
10698
10699 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10700 return false;
10701
10702 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10703
10704 if (l2 > 0 && l2 <= 4)
10705 return true;
10706
10707 return false;
10708 }
10709
10710 /* Helper function for rtx cost calculation. Calculate the cost of
10711 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10712 Return the calculated cost of the expression, recursing manually into
10713 operands where needed. */
10714
10715 static int
10716 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10717 {
10718 rtx op0, op1;
10719 const struct cpu_cost_table *extra_cost
10720 = aarch64_tune_params.insn_extra_cost;
10721 int cost = 0;
10722 bool compound_p = (outer == PLUS || outer == MINUS);
10723 machine_mode mode = GET_MODE (x);
10724
10725 gcc_checking_assert (code == MULT);
10726
10727 op0 = XEXP (x, 0);
10728 op1 = XEXP (x, 1);
10729
10730 if (VECTOR_MODE_P (mode))
10731 mode = GET_MODE_INNER (mode);
10732
10733 /* Integer multiply/fma. */
10734 if (GET_MODE_CLASS (mode) == MODE_INT)
10735 {
10736 /* The multiply will be canonicalized as a shift, cost it as such. */
10737 if (aarch64_shift_p (GET_CODE (x))
10738 || (CONST_INT_P (op1)
10739 && exact_log2 (INTVAL (op1)) > 0))
10740 {
10741 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10742 || GET_CODE (op0) == SIGN_EXTEND;
10743 if (speed)
10744 {
10745 if (compound_p)
10746 {
10747 /* If the shift is considered cheap,
10748 then don't add any cost. */
10749 if (aarch64_cheap_mult_shift_p (x))
10750 ;
10751 else if (REG_P (op1))
10752 /* ARITH + shift-by-register. */
10753 cost += extra_cost->alu.arith_shift_reg;
10754 else if (is_extend)
10755 /* ARITH + extended register. We don't have a cost field
10756 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10757 cost += extra_cost->alu.extend_arith;
10758 else
10759 /* ARITH + shift-by-immediate. */
10760 cost += extra_cost->alu.arith_shift;
10761 }
10762 else
10763 /* LSL (immediate). */
10764 cost += extra_cost->alu.shift;
10765
10766 }
10767 /* Strip extends as we will have costed them in the case above. */
10768 if (is_extend)
10769 op0 = aarch64_strip_extend (op0, true);
10770
10771 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10772
10773 return cost;
10774 }
10775
10776 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
10777 compound and let the below cases handle it. After all, MNEG is a
10778 special-case alias of MSUB. */
10779 if (GET_CODE (op0) == NEG)
10780 {
10781 op0 = XEXP (op0, 0);
10782 compound_p = true;
10783 }
10784
10785 /* Integer multiplies or FMAs have zero/sign extending variants. */
10786 if ((GET_CODE (op0) == ZERO_EXTEND
10787 && GET_CODE (op1) == ZERO_EXTEND)
10788 || (GET_CODE (op0) == SIGN_EXTEND
10789 && GET_CODE (op1) == SIGN_EXTEND))
10790 {
10791 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10792 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10793
10794 if (speed)
10795 {
10796 if (compound_p)
10797 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10798 cost += extra_cost->mult[0].extend_add;
10799 else
10800 /* MUL/SMULL/UMULL. */
10801 cost += extra_cost->mult[0].extend;
10802 }
10803
10804 return cost;
10805 }
10806
10807 /* This is either an integer multiply or a MADD. In both cases
10808 we want to recurse and cost the operands. */
10809 cost += rtx_cost (op0, mode, MULT, 0, speed);
10810 cost += rtx_cost (op1, mode, MULT, 1, speed);
10811
10812 if (speed)
10813 {
10814 if (compound_p)
10815 /* MADD/MSUB. */
10816 cost += extra_cost->mult[mode == DImode].add;
10817 else
10818 /* MUL. */
10819 cost += extra_cost->mult[mode == DImode].simple;
10820 }
10821
10822 return cost;
10823 }
10824 else
10825 {
10826 if (speed)
10827 {
10828 /* Floating-point FMA/FMUL can also support negations of the
10829 operands, unless the rounding mode is upward or downward in
10830 which case FNMUL is different from FMUL with operand negation. */
10831 bool neg0 = GET_CODE (op0) == NEG;
10832 bool neg1 = GET_CODE (op1) == NEG;
10833 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10834 {
10835 if (neg0)
10836 op0 = XEXP (op0, 0);
10837 if (neg1)
10838 op1 = XEXP (op1, 0);
10839 }
10840
10841 if (compound_p)
10842 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10843 cost += extra_cost->fp[mode == DFmode].fma;
10844 else
10845 /* FMUL/FNMUL. */
10846 cost += extra_cost->fp[mode == DFmode].mult;
10847 }
10848
10849 cost += rtx_cost (op0, mode, MULT, 0, speed);
10850 cost += rtx_cost (op1, mode, MULT, 1, speed);
10851 return cost;
10852 }
10853 }
10854
10855 static int
10856 aarch64_address_cost (rtx x,
10857 machine_mode mode,
10858 addr_space_t as ATTRIBUTE_UNUSED,
10859 bool speed)
10860 {
10861 enum rtx_code c = GET_CODE (x);
10862 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10863 struct aarch64_address_info info;
10864 int cost = 0;
10865 info.shift = 0;
10866
10867 if (!aarch64_classify_address (&info, x, mode, false))
10868 {
10869 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10870 {
10871 /* This is a CONST or SYMBOL ref which will be split
10872 in a different way depending on the code model in use.
10873 Cost it through the generic infrastructure. */
10874 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10875 /* Divide through by the cost of one instruction to
10876 bring it to the same units as the address costs. */
10877 cost_symbol_ref /= COSTS_N_INSNS (1);
10878 /* The cost is then the cost of preparing the address,
10879 followed by an immediate (possibly 0) offset. */
10880 return cost_symbol_ref + addr_cost->imm_offset;
10881 }
10882 else
10883 {
10884 /* This is most likely a jump table from a case
10885 statement. */
10886 return addr_cost->register_offset;
10887 }
10888 }
10889
10890 switch (info.type)
10891 {
10892 case ADDRESS_LO_SUM:
10893 case ADDRESS_SYMBOLIC:
10894 case ADDRESS_REG_IMM:
10895 cost += addr_cost->imm_offset;
10896 break;
10897
10898 case ADDRESS_REG_WB:
10899 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10900 cost += addr_cost->pre_modify;
10901 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10902 cost += addr_cost->post_modify;
10903 else
10904 gcc_unreachable ();
10905
10906 break;
10907
10908 case ADDRESS_REG_REG:
10909 cost += addr_cost->register_offset;
10910 break;
10911
10912 case ADDRESS_REG_SXTW:
10913 cost += addr_cost->register_sextend;
10914 break;
10915
10916 case ADDRESS_REG_UXTW:
10917 cost += addr_cost->register_zextend;
10918 break;
10919
10920 default:
10921 gcc_unreachable ();
10922 }
10923
10924
10925 if (info.shift > 0)
10926 {
10927 /* For the sake of calculating the cost of the shifted register
10928 component, we can treat same sized modes in the same way. */
10929 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10930 cost += addr_cost->addr_scale_costs.hi;
10931 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10932 cost += addr_cost->addr_scale_costs.si;
10933 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10934 cost += addr_cost->addr_scale_costs.di;
10935 else
10936 /* We can't tell, or this is a 128-bit vector. */
10937 cost += addr_cost->addr_scale_costs.ti;
10938 }
10939
10940 return cost;
10941 }
10942
10943 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10944 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10945 to be taken. */
10946
10947 int
10948 aarch64_branch_cost (bool speed_p, bool predictable_p)
10949 {
10950 /* When optimizing for speed, use the cost of unpredictable branches. */
10951 const struct cpu_branch_cost *branch_costs =
10952 aarch64_tune_params.branch_costs;
10953
10954 if (!speed_p || predictable_p)
10955 return branch_costs->predictable;
10956 else
10957 return branch_costs->unpredictable;
10958 }
10959
10960 /* Return true if the RTX X in mode MODE is a zero or sign extract
10961 usable in an ADD or SUB (extended register) instruction. */
10962 static bool
10963 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10964 {
10965 /* Catch add with a sign extract.
10966 This is add_<optab><mode>_multp2. */
10967 if (GET_CODE (x) == SIGN_EXTRACT
10968 || GET_CODE (x) == ZERO_EXTRACT)
10969 {
10970 rtx op0 = XEXP (x, 0);
10971 rtx op1 = XEXP (x, 1);
10972 rtx op2 = XEXP (x, 2);
10973
10974 if (GET_CODE (op0) == MULT
10975 && CONST_INT_P (op1)
10976 && op2 == const0_rtx
10977 && CONST_INT_P (XEXP (op0, 1))
10978 && aarch64_is_extend_from_extract (mode,
10979 XEXP (op0, 1),
10980 op1))
10981 {
10982 return true;
10983 }
10984 }
10985 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10986 No shift. */
10987 else if (GET_CODE (x) == SIGN_EXTEND
10988 || GET_CODE (x) == ZERO_EXTEND)
10989 return REG_P (XEXP (x, 0));
10990
10991 return false;
10992 }
10993
10994 static bool
10995 aarch64_frint_unspec_p (unsigned int u)
10996 {
10997 switch (u)
10998 {
10999 case UNSPEC_FRINTZ:
11000 case UNSPEC_FRINTP:
11001 case UNSPEC_FRINTM:
11002 case UNSPEC_FRINTA:
11003 case UNSPEC_FRINTN:
11004 case UNSPEC_FRINTX:
11005 case UNSPEC_FRINTI:
11006 return true;
11007
11008 default:
11009 return false;
11010 }
11011 }
11012
11013 /* Return true iff X is an rtx that will match an extr instruction
11014 i.e. as described in the *extr<mode>5_insn family of patterns.
11015 OP0 and OP1 will be set to the operands of the shifts involved
11016 on success and will be NULL_RTX otherwise. */
11017
11018 static bool
11019 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11020 {
11021 rtx op0, op1;
11022 scalar_int_mode mode;
11023 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11024 return false;
11025
11026 *res_op0 = NULL_RTX;
11027 *res_op1 = NULL_RTX;
11028
11029 if (GET_CODE (x) != IOR)
11030 return false;
11031
11032 op0 = XEXP (x, 0);
11033 op1 = XEXP (x, 1);
11034
11035 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11036 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11037 {
11038 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11039 if (GET_CODE (op1) == ASHIFT)
11040 std::swap (op0, op1);
11041
11042 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11043 return false;
11044
11045 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11046 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11047
11048 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11049 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11050 {
11051 *res_op0 = XEXP (op0, 0);
11052 *res_op1 = XEXP (op1, 0);
11053 return true;
11054 }
11055 }
11056
11057 return false;
11058 }
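/* A concrete example of the shape recognised above (illustrative only):
   for DImode, (ior (ashift (reg a) (const_int 48))
		    (lshiftrt (reg b) (const_int 16)))
   satisfies 48 + 16 == 64, so *RES_OP0 is set to A, *RES_OP1 to B, and the
   whole expression can be implemented as a single EXTR with a shift amount
   of 16.  */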
11059
11060 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11061 storing it in *COST. Result is true if the total cost of the operation
11062 has now been calculated. */
11063 static bool
11064 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11065 {
11066 rtx inner;
11067 rtx comparator;
11068 enum rtx_code cmpcode;
11069 const struct cpu_cost_table *extra_cost
11070 = aarch64_tune_params.insn_extra_cost;
11071
11072 if (COMPARISON_P (op0))
11073 {
11074 inner = XEXP (op0, 0);
11075 comparator = XEXP (op0, 1);
11076 cmpcode = GET_CODE (op0);
11077 }
11078 else
11079 {
11080 inner = op0;
11081 comparator = const0_rtx;
11082 cmpcode = NE;
11083 }
11084
11085 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11086 {
11087 /* Conditional branch. */
11088 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11089 return true;
11090 else
11091 {
11092 if (cmpcode == NE || cmpcode == EQ)
11093 {
11094 if (comparator == const0_rtx)
11095 {
11096 /* TBZ/TBNZ/CBZ/CBNZ. */
11097 if (GET_CODE (inner) == ZERO_EXTRACT)
11098 /* TBZ/TBNZ. */
11099 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11100 ZERO_EXTRACT, 0, speed);
11101 else
11102 /* CBZ/CBNZ. */
11103 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11104
11105 return true;
11106 }
11107 if (register_operand (inner, VOIDmode)
11108 && aarch64_imm24 (comparator, VOIDmode))
11109 {
11110 /* SUB and SUBS. */
11111 *cost += COSTS_N_INSNS (2);
11112 if (speed)
11113 *cost += extra_cost->alu.arith * 2;
11114 return true;
11115 }
11116 }
11117 else if (cmpcode == LT || cmpcode == GE)
11118 {
11119 /* TBZ/TBNZ. */
11120 if (comparator == const0_rtx)
11121 return true;
11122 }
11123 }
11124 }
11125 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11126 {
11127 /* CCMP. */
11128 if (GET_CODE (op1) == COMPARE)
11129 {
11130 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11131 if (XEXP (op1, 1) == const0_rtx)
11132 *cost += 1;
11133 if (speed)
11134 {
11135 machine_mode mode = GET_MODE (XEXP (op1, 0));
11136 const struct cpu_cost_table *extra_cost
11137 = aarch64_tune_params.insn_extra_cost;
11138
11139 if (GET_MODE_CLASS (mode) == MODE_INT)
11140 *cost += extra_cost->alu.arith;
11141 else
11142 *cost += extra_cost->fp[mode == DFmode].compare;
11143 }
11144 return true;
11145 }
11146
11147 /* It's a conditional operation based on the status flags,
11148 so it must be some flavor of CSEL. */
11149
11150 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11151 if (GET_CODE (op1) == NEG
11152 || GET_CODE (op1) == NOT
11153 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11154 op1 = XEXP (op1, 0);
11155 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11156 {
11157 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11158 op1 = XEXP (op1, 0);
11159 op2 = XEXP (op2, 0);
11160 }
11161
11162 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11163 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11164 return true;
11165 }
11166
11167 /* We don't know what this is, cost all operands. */
11168 return false;
11169 }
11170
11171 /* Check whether X is a bitfield operation of the form shift + extend that
11172 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11173 operand to which the bitfield operation is applied. Otherwise return
11174 NULL_RTX. */
11175
11176 static rtx
11177 aarch64_extend_bitfield_pattern_p (rtx x)
11178 {
11179 rtx_code outer_code = GET_CODE (x);
11180 machine_mode outer_mode = GET_MODE (x);
11181
11182 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11183 && outer_mode != SImode && outer_mode != DImode)
11184 return NULL_RTX;
11185
11186 rtx inner = XEXP (x, 0);
11187 rtx_code inner_code = GET_CODE (inner);
11188 machine_mode inner_mode = GET_MODE (inner);
11189 rtx op = NULL_RTX;
11190
11191 switch (inner_code)
11192 {
11193 case ASHIFT:
11194 if (CONST_INT_P (XEXP (inner, 1))
11195 && (inner_mode == QImode || inner_mode == HImode))
11196 op = XEXP (inner, 0);
11197 break;
11198 case LSHIFTRT:
11199 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11200 && (inner_mode == QImode || inner_mode == HImode))
11201 op = XEXP (inner, 0);
11202 break;
11203 case ASHIFTRT:
11204 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11205 && (inner_mode == QImode || inner_mode == HImode))
11206 op = XEXP (inner, 0);
11207 break;
11208 default:
11209 break;
11210 }
11211
11212 return op;
11213 }
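/* For example (illustrative RTL only), the function above returns the inner
   register for (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))),
   which maps to a UBFX, and for (sign_extend:DI (ashift:QI (reg:QI r)
   (const_int 2))), which maps to an SBFIZ; a plain (zero_extend:SI
   (reg:HI r)) contains no shift and so yields NULL_RTX.  */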
11214
11215 /* Return true if the mask and a shift amount from an RTX of the form
11216 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11217 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11218
11219 bool
11220 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11221 rtx shft_amnt)
11222 {
11223 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11224 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11225 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11226 && (INTVAL (mask)
11227 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11228 }
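/* A worked example for the check above (illustrative values): in SImode,
   MASK == 0xff0 with SHFT_AMNT == 4 is accepted, because 0xff0 >> 4 is
   0xff (a contiguous mask of 8 bits starting at bit 0) and no mask bit
   lies below the shift amount; (x << 4) & 0xff0 can therefore become a
   UBFIZ with lsb 4 and width 8.  MASK == 0xff8 with the same shift is
   rejected, since bit 3 of the mask sits below the shift amount.  */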
11229
11230 /* Return true if the masks and a shift amount from an RTX of the form
11231 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11232 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
11233
11234 bool
11235 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11236 unsigned HOST_WIDE_INT mask1,
11237 unsigned HOST_WIDE_INT shft_amnt,
11238 unsigned HOST_WIDE_INT mask2)
11239 {
11240 unsigned HOST_WIDE_INT t;
11241
11242 /* Verify that there is no overlap in what bits are set in the two masks. */
11243 if (mask1 != ~mask2)
11244 return false;
11245
11246 /* Verify that mask2 is not all zeros or ones. */
11247 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11248 return false;
11249
11250 /* The shift amount should always be less than the mode size. */
11251 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11252
11253 /* Verify that the mask being shifted is contiguous and would be in the
11254 least significant bits after shifting by shft_amnt. */
11255 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11256 return (t == (t & -t));
11257 }
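/* A worked example for the check above (illustrative values): with
   SHFT_AMNT == 8, MASK2 == 0xff00 and MASK1 == ~0xff00, the test passes
   because the masks are complementary and 0xff00 + (1 << 8) == 0x10000 is
   a power of two, i.e. MASK2 is a contiguous run of bits starting at the
   shift amount.  The expression (x & ~0xff00) | ((y << 8) & 0xff00) can
   then be emitted as a BFI that inserts the low 8 bits of y at bit 8 of x.  */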
11258
11259 /* Calculate the cost of calculating X, storing it in *COST. Result
11260 is true if the total cost of the operation has now been calculated. */
11261 static bool
11262 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11263 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11264 {
11265 rtx op0, op1, op2;
11266 const struct cpu_cost_table *extra_cost
11267 = aarch64_tune_params.insn_extra_cost;
11268 int code = GET_CODE (x);
11269 scalar_int_mode int_mode;
11270
11271 /* By default, assume that everything has equivalent cost to the
11272 cheapest instruction. Any additional costs are applied as a delta
11273 above this default. */
11274 *cost = COSTS_N_INSNS (1);
11275
11276 switch (code)
11277 {
11278 case SET:
11279 /* The cost depends entirely on the operands to SET. */
11280 *cost = 0;
11281 op0 = SET_DEST (x);
11282 op1 = SET_SRC (x);
11283
11284 switch (GET_CODE (op0))
11285 {
11286 case MEM:
11287 if (speed)
11288 {
11289 rtx address = XEXP (op0, 0);
11290 if (VECTOR_MODE_P (mode))
11291 *cost += extra_cost->ldst.storev;
11292 else if (GET_MODE_CLASS (mode) == MODE_INT)
11293 *cost += extra_cost->ldst.store;
11294 else if (mode == SFmode)
11295 *cost += extra_cost->ldst.storef;
11296 else if (mode == DFmode)
11297 *cost += extra_cost->ldst.stored;
11298
11299 *cost +=
11300 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11301 0, speed));
11302 }
11303
11304 *cost += rtx_cost (op1, mode, SET, 1, speed);
11305 return true;
11306
11307 case SUBREG:
11308 if (! REG_P (SUBREG_REG (op0)))
11309 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11310
11311 /* Fall through. */
11312 case REG:
11313 /* The cost is one per vector-register copied. */
11314 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11315 {
11316 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11317 *cost = COSTS_N_INSNS (nregs);
11318 }
11319 /* const0_rtx is in general free, but we will use an
11320 instruction to set a register to 0. */
11321 else if (REG_P (op1) || op1 == const0_rtx)
11322 {
11323 /* The cost is 1 per register copied. */
11324 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11325 *cost = COSTS_N_INSNS (nregs);
11326 }
11327 else
11328 /* Cost is just the cost of the RHS of the set. */
11329 *cost += rtx_cost (op1, mode, SET, 1, speed);
11330 return true;
11331
11332 case ZERO_EXTRACT:
11333 case SIGN_EXTRACT:
11334 /* Bit-field insertion. Strip any redundant widening of
11335 the RHS to meet the width of the target. */
11336 if (GET_CODE (op1) == SUBREG)
11337 op1 = SUBREG_REG (op1);
11338 if ((GET_CODE (op1) == ZERO_EXTEND
11339 || GET_CODE (op1) == SIGN_EXTEND)
11340 && CONST_INT_P (XEXP (op0, 1))
11341 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11342 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11343 op1 = XEXP (op1, 0);
11344
11345 if (CONST_INT_P (op1))
11346 {
11347 /* MOV immediate is assumed to always be cheap. */
11348 *cost = COSTS_N_INSNS (1);
11349 }
11350 else
11351 {
11352 /* BFM. */
11353 if (speed)
11354 *cost += extra_cost->alu.bfi;
11355 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11356 }
11357
11358 return true;
11359
11360 default:
11361 /* We can't make sense of this, assume default cost. */
11362 *cost = COSTS_N_INSNS (1);
11363 return false;
11364 }
11365 return false;
11366
11367 case CONST_INT:
11368 /* If an instruction can incorporate a constant within the
11369 instruction, the instruction's expression avoids calling
11370 rtx_cost() on the constant. If rtx_cost() is called on a
11371 constant, then it is usually because the constant must be
11372 moved into a register by one or more instructions.
11373
11374 The exception is constant 0, which can be expressed
11375 as XZR/WZR and is therefore free. The exception to this is
11376 if we have (set (reg) (const0_rtx)) in which case we must cost
11377 the move. However, we can catch that when we cost the SET, so
11378 we don't need to consider that here. */
11379 if (x == const0_rtx)
11380 *cost = 0;
11381 else
11382 {
11383 /* To an approximation, building any other constant is
11384 proportionally expensive to the number of instructions
11385 required to build that constant. This is true whether we
11386 are compiling for SPEED or otherwise. */
11387 if (!is_a <scalar_int_mode> (mode, &int_mode))
11388 int_mode = word_mode;
11389 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11390 (NULL_RTX, x, false, int_mode));
11391 }
11392 return true;
11393
11394 case CONST_DOUBLE:
11395
11396 /* First determine number of instructions to do the move
11397 as an integer constant. */
11398 if (!aarch64_float_const_representable_p (x)
11399 && !aarch64_can_const_movi_rtx_p (x, mode)
11400 && aarch64_float_const_rtx_p (x))
11401 {
11402 unsigned HOST_WIDE_INT ival;
11403 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11404 gcc_assert (succeed);
11405
11406 scalar_int_mode imode = (mode == HFmode
11407 ? SImode
11408 : int_mode_for_mode (mode).require ());
11409 int ncost = aarch64_internal_mov_immediate
11410 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11411 *cost += COSTS_N_INSNS (ncost);
11412 return true;
11413 }
11414
11415 if (speed)
11416 {
11417 /* mov[df,sf]_aarch64. */
11418 if (aarch64_float_const_representable_p (x))
11419 /* FMOV (scalar immediate). */
11420 *cost += extra_cost->fp[mode == DFmode].fpconst;
11421 else if (!aarch64_float_const_zero_rtx_p (x))
11422 {
11423 /* This will be a load from memory. */
11424 if (mode == DFmode)
11425 *cost += extra_cost->ldst.loadd;
11426 else
11427 *cost += extra_cost->ldst.loadf;
11428 }
11429 else
11430 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11431 or MOV v0.s[0], wzr - neither of which is modeled by the
11432 cost tables. Just use the default cost. */
11433 {
11434 }
11435 }
11436
11437 return true;
11438
11439 case MEM:
11440 if (speed)
11441 {
11442 /* For loads we want the base cost of a load, plus an
11443 approximation for the additional cost of the addressing
11444 mode. */
11445 rtx address = XEXP (x, 0);
11446 if (VECTOR_MODE_P (mode))
11447 *cost += extra_cost->ldst.loadv;
11448 else if (GET_MODE_CLASS (mode) == MODE_INT)
11449 *cost += extra_cost->ldst.load;
11450 else if (mode == SFmode)
11451 *cost += extra_cost->ldst.loadf;
11452 else if (mode == DFmode)
11453 *cost += extra_cost->ldst.loadd;
11454
11455 *cost +=
11456 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11457 0, speed));
11458 }
11459
11460 return true;
11461
11462 case NEG:
11463 op0 = XEXP (x, 0);
11464
11465 if (VECTOR_MODE_P (mode))
11466 {
11467 if (speed)
11468 {
11469 /* FNEG. */
11470 *cost += extra_cost->vect.alu;
11471 }
11472 return false;
11473 }
11474
11475 if (GET_MODE_CLASS (mode) == MODE_INT)
11476 {
11477 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11478 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11479 {
11480 /* CSETM. */
11481 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11482 return true;
11483 }
11484
11485 /* Cost this as SUB wzr, X. */
11486 op0 = CONST0_RTX (mode);
11487 op1 = XEXP (x, 0);
11488 goto cost_minus;
11489 }
11490
11491 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11492 {
11493 /* Support (neg(fma...)) as a single instruction only if
11494 sign of zeros is unimportant. This matches the decision
11495 making in aarch64.md. */
11496 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11497 {
11498 /* FNMADD. */
11499 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11500 return true;
11501 }
11502 if (GET_CODE (op0) == MULT)
11503 {
11504 /* FNMUL. */
11505 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11506 return true;
11507 }
11508 if (speed)
11509 /* FNEG. */
11510 *cost += extra_cost->fp[mode == DFmode].neg;
11511 return false;
11512 }
11513
11514 return false;
11515
11516 case CLRSB:
11517 case CLZ:
11518 if (speed)
11519 {
11520 if (VECTOR_MODE_P (mode))
11521 *cost += extra_cost->vect.alu;
11522 else
11523 *cost += extra_cost->alu.clz;
11524 }
11525
11526 return false;
11527
11528 case CTZ:
11529 *cost = COSTS_N_INSNS (2);
11530
11531 if (speed)
11532 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
11533 return false;
11534
11535 case COMPARE:
11536 op0 = XEXP (x, 0);
11537 op1 = XEXP (x, 1);
11538
11539 if (op1 == const0_rtx
11540 && GET_CODE (op0) == AND)
11541 {
11542 x = op0;
11543 mode = GET_MODE (op0);
11544 goto cost_logic;
11545 }
11546
11547 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11548 {
11549 /* TODO: A write to the CC flags possibly costs extra; this
11550 needs encoding in the cost tables. */
11551
11552 mode = GET_MODE (op0);
11553 /* ANDS. */
11554 if (GET_CODE (op0) == AND)
11555 {
11556 x = op0;
11557 goto cost_logic;
11558 }
11559
11560 if (GET_CODE (op0) == PLUS)
11561 {
11562 /* ADDS (and CMN alias). */
11563 x = op0;
11564 goto cost_plus;
11565 }
11566
11567 if (GET_CODE (op0) == MINUS)
11568 {
11569 /* SUBS. */
11570 x = op0;
11571 goto cost_minus;
11572 }
11573
11574 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11575 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11576 && CONST_INT_P (XEXP (op0, 2)))
11577 {
11578 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11579 Handle it here directly rather than going to cost_logic
11580 since we know the immediate generated for the TST is valid
11581 so we can avoid creating an intermediate rtx for it only
11582 for costing purposes. */
11583 if (speed)
11584 *cost += extra_cost->alu.logical;
11585
11586 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11587 ZERO_EXTRACT, 0, speed);
11588 return true;
11589 }
11590
11591 if (GET_CODE (op1) == NEG)
11592 {
11593 /* CMN. */
11594 if (speed)
11595 *cost += extra_cost->alu.arith;
11596
11597 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11598 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11599 return true;
11600 }
11601
11602 /* CMP.
11603
11604 Compare can freely swap the order of operands, and
11605 canonicalization puts the more complex operation first.
11606 But the integer MINUS logic expects the shift/extend
11607 operation in op1. */
11608 if (! (REG_P (op0)
11609 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11610 {
11611 op0 = XEXP (x, 1);
11612 op1 = XEXP (x, 0);
11613 }
11614 goto cost_minus;
11615 }
11616
11617 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11618 {
11619 /* FCMP. */
11620 if (speed)
11621 *cost += extra_cost->fp[mode == DFmode].compare;
11622
11623 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11624 {
11625 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11626 /* FCMP supports constant 0.0 for no extra cost. */
11627 return true;
11628 }
11629 return false;
11630 }
11631
11632 if (VECTOR_MODE_P (mode))
11633 {
11634 /* Vector compare. */
11635 if (speed)
11636 *cost += extra_cost->vect.alu;
11637
11638 if (aarch64_float_const_zero_rtx_p (op1))
11639 {
11640 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11641 cost. */
11642 return true;
11643 }
11644 return false;
11645 }
11646 return false;
11647
11648 case MINUS:
11649 {
11650 op0 = XEXP (x, 0);
11651 op1 = XEXP (x, 1);
11652
11653 cost_minus:
11654 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11655
11656 /* Detect valid immediates. */
11657 if ((GET_MODE_CLASS (mode) == MODE_INT
11658 || (GET_MODE_CLASS (mode) == MODE_CC
11659 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11660 && CONST_INT_P (op1)
11661 && aarch64_uimm12_shift (INTVAL (op1)))
11662 {
11663 if (speed)
11664 /* SUB(S) (immediate). */
11665 *cost += extra_cost->alu.arith;
11666 return true;
11667 }
11668
11669 /* Look for SUB (extended register). */
11670 if (is_a <scalar_int_mode> (mode, &int_mode)
11671 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11672 {
11673 if (speed)
11674 *cost += extra_cost->alu.extend_arith;
11675
11676 op1 = aarch64_strip_extend (op1, true);
11677 *cost += rtx_cost (op1, VOIDmode,
11678 (enum rtx_code) GET_CODE (op1), 0, speed);
11679 return true;
11680 }
11681
11682 rtx new_op1 = aarch64_strip_extend (op1, false);
11683
11684 /* Cost this as an FMA-alike operation. */
11685 if ((GET_CODE (new_op1) == MULT
11686 || aarch64_shift_p (GET_CODE (new_op1)))
11687 && code != COMPARE)
11688 {
11689 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11690 (enum rtx_code) code,
11691 speed);
11692 return true;
11693 }
11694
11695 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11696
11697 if (speed)
11698 {
11699 if (VECTOR_MODE_P (mode))
11700 {
11701 /* Vector SUB. */
11702 *cost += extra_cost->vect.alu;
11703 }
11704 else if (GET_MODE_CLASS (mode) == MODE_INT)
11705 {
11706 /* SUB(S). */
11707 *cost += extra_cost->alu.arith;
11708 }
11709 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11710 {
11711 /* FSUB. */
11712 *cost += extra_cost->fp[mode == DFmode].addsub;
11713 }
11714 }
11715 return true;
11716 }
11717
11718 case PLUS:
11719 {
11720 rtx new_op0;
11721
11722 op0 = XEXP (x, 0);
11723 op1 = XEXP (x, 1);
11724
11725 cost_plus:
11726 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11727 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11728 {
11729 /* CSINC. */
11730 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11731 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11732 return true;
11733 }
11734
11735 if (GET_MODE_CLASS (mode) == MODE_INT
11736 && (aarch64_plus_immediate (op1, mode)
11737 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11738 {
11739 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11740
11741 if (speed)
11742 /* ADD (immediate). */
11743 *cost += extra_cost->alu.arith;
11744 return true;
11745 }
11746
11747 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11748
11749 /* Look for ADD (extended register). */
11750 if (is_a <scalar_int_mode> (mode, &int_mode)
11751 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11752 {
11753 if (speed)
11754 *cost += extra_cost->alu.extend_arith;
11755
11756 op0 = aarch64_strip_extend (op0, true);
11757 *cost += rtx_cost (op0, VOIDmode,
11758 (enum rtx_code) GET_CODE (op0), 0, speed);
11759 return true;
11760 }
11761
11762 /* Strip any extend, leave shifts behind as we will
11763 cost them through mult_cost. */
11764 new_op0 = aarch64_strip_extend (op0, false);
11765
11766 if (GET_CODE (new_op0) == MULT
11767 || aarch64_shift_p (GET_CODE (new_op0)))
11768 {
11769 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11770 speed);
11771 return true;
11772 }
11773
11774 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11775
11776 if (speed)
11777 {
11778 if (VECTOR_MODE_P (mode))
11779 {
11780 /* Vector ADD. */
11781 *cost += extra_cost->vect.alu;
11782 }
11783 else if (GET_MODE_CLASS (mode) == MODE_INT)
11784 {
11785 /* ADD. */
11786 *cost += extra_cost->alu.arith;
11787 }
11788 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11789 {
11790 /* FADD. */
11791 *cost += extra_cost->fp[mode == DFmode].addsub;
11792 }
11793 }
11794 return true;
11795 }
11796
11797 case BSWAP:
11798 *cost = COSTS_N_INSNS (1);
11799
11800 if (speed)
11801 {
11802 if (VECTOR_MODE_P (mode))
11803 *cost += extra_cost->vect.alu;
11804 else
11805 *cost += extra_cost->alu.rev;
11806 }
11807 return false;
11808
11809 case IOR:
11810 if (aarch_rev16_p (x))
11811 {
11812 *cost = COSTS_N_INSNS (1);
11813
11814 if (speed)
11815 {
11816 if (VECTOR_MODE_P (mode))
11817 *cost += extra_cost->vect.alu;
11818 else
11819 *cost += extra_cost->alu.rev;
11820 }
11821 return true;
11822 }
11823
11824 if (aarch64_extr_rtx_p (x, &op0, &op1))
11825 {
11826 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11827 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11828 if (speed)
11829 *cost += extra_cost->alu.shift;
11830
11831 return true;
11832 }
11833 /* Fall through. */
11834 case XOR:
11835 case AND:
11836 cost_logic:
11837 op0 = XEXP (x, 0);
11838 op1 = XEXP (x, 1);
11839
11840 if (VECTOR_MODE_P (mode))
11841 {
11842 if (speed)
11843 *cost += extra_cost->vect.alu;
11844 return true;
11845 }
11846
11847 if (code == AND
11848 && GET_CODE (op0) == MULT
11849 && CONST_INT_P (XEXP (op0, 1))
11850 && CONST_INT_P (op1)
11851 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11852 INTVAL (op1)) != 0)
11853 {
11854 /* This is a UBFM/SBFM. */
11855 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11856 if (speed)
11857 *cost += extra_cost->alu.bfx;
11858 return true;
11859 }
11860
11861 if (is_int_mode (mode, &int_mode))
11862 {
11863 if (CONST_INT_P (op1))
11864 {
11865 /* We have a mask + shift version of a UBFIZ
11866 i.e. the *andim_ashift<mode>_bfiz pattern. */
11867 if (GET_CODE (op0) == ASHIFT
11868 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11869 XEXP (op0, 1)))
11870 {
11871 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11872 (enum rtx_code) code, 0, speed);
11873 if (speed)
11874 *cost += extra_cost->alu.bfx;
11875
11876 return true;
11877 }
11878 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11879 {
11880 /* We possibly get the immediate for free; this is not
11881 modelled. */
11882 *cost += rtx_cost (op0, int_mode,
11883 (enum rtx_code) code, 0, speed);
11884 if (speed)
11885 *cost += extra_cost->alu.logical;
11886
11887 return true;
11888 }
11889 }
11890 else
11891 {
11892 rtx new_op0 = op0;
11893
11894 /* Handle ORN, EON, or BIC. */
11895 if (GET_CODE (op0) == NOT)
11896 op0 = XEXP (op0, 0);
11897
11898 new_op0 = aarch64_strip_shift (op0);
11899
11900 /* If we had a shift on op0 then this is a logical-shift-
11901 by-register/immediate operation. Otherwise, this is just
11902 a logical operation. */
11903 if (speed)
11904 {
11905 if (new_op0 != op0)
11906 {
11907 /* Shift by immediate. */
11908 if (CONST_INT_P (XEXP (op0, 1)))
11909 *cost += extra_cost->alu.log_shift;
11910 else
11911 *cost += extra_cost->alu.log_shift_reg;
11912 }
11913 else
11914 *cost += extra_cost->alu.logical;
11915 }
11916
11917 /* In both cases we want to cost both operands. */
11918 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11919 0, speed);
11920 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11921 1, speed);
11922
11923 return true;
11924 }
11925 }
11926 return false;
11927
11928 case NOT:
11929 x = XEXP (x, 0);
11930 op0 = aarch64_strip_shift (x);
11931
11932 if (VECTOR_MODE_P (mode))
11933 {
11934 /* Vector NOT. */
11935 *cost += extra_cost->vect.alu;
11936 return false;
11937 }
11938
11939 /* MVN-shifted-reg. */
11940 if (op0 != x)
11941 {
11942 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11943
11944 if (speed)
11945 *cost += extra_cost->alu.log_shift;
11946
11947 return true;
11948 }
11949 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11950 Handle the second form here taking care that 'a' in the above can
11951 be a shift. */
11952 else if (GET_CODE (op0) == XOR)
11953 {
11954 rtx newop0 = XEXP (op0, 0);
11955 rtx newop1 = XEXP (op0, 1);
11956 rtx op0_stripped = aarch64_strip_shift (newop0);
11957
11958 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11959 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11960
11961 if (speed)
11962 {
11963 if (op0_stripped != newop0)
11964 *cost += extra_cost->alu.log_shift;
11965 else
11966 *cost += extra_cost->alu.logical;
11967 }
11968
11969 return true;
11970 }
11971 /* MVN. */
11972 if (speed)
11973 *cost += extra_cost->alu.logical;
11974
11975 return false;
11976
11977 case ZERO_EXTEND:
11978
11979 op0 = XEXP (x, 0);
11980 /* If a value is written in SI mode, then zero extended to DI
11981 mode, the operation will in general be free as a write to
11982 a 'w' register implicitly zeroes the upper bits of an 'x'
11983 register. However, if this is
11984
11985 (set (reg) (zero_extend (reg)))
11986
11987 we must cost the explicit register move. */
11988 if (mode == DImode
11989 && GET_MODE (op0) == SImode
11990 && outer == SET)
11991 {
11992 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11993
11994 /* If OP_COST is non-zero, then the cost of the zero extend
11995 is effectively the cost of the inner operation. Otherwise
11996 we have a MOV instruction and we take the cost from the MOV
11997 itself. This is true independently of whether we are
11998 optimizing for space or time. */
11999 if (op_cost)
12000 *cost = op_cost;
12001
12002 return true;
12003 }
12004 else if (MEM_P (op0))
12005 {
12006 /* All loads can zero extend to any size for free. */
12007 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12008 return true;
12009 }
12010
12011 op0 = aarch64_extend_bitfield_pattern_p (x);
12012 if (op0)
12013 {
12014 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12015 if (speed)
12016 *cost += extra_cost->alu.bfx;
12017 return true;
12018 }
12019
12020 if (speed)
12021 {
12022 if (VECTOR_MODE_P (mode))
12023 {
12024 /* UMOV. */
12025 *cost += extra_cost->vect.alu;
12026 }
12027 else
12028 {
12029 /* We generate an AND instead of UXTB/UXTH. */
12030 *cost += extra_cost->alu.logical;
12031 }
12032 }
12033 return false;
12034
12035 case SIGN_EXTEND:
12036 if (MEM_P (XEXP (x, 0)))
12037 {
12038 /* LDRSH. */
12039 if (speed)
12040 {
12041 rtx address = XEXP (XEXP (x, 0), 0);
12042 *cost += extra_cost->ldst.load_sign_extend;
12043
12044 *cost +=
12045 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12046 0, speed));
12047 }
12048 return true;
12049 }
12050
12051 op0 = aarch64_extend_bitfield_pattern_p (x);
12052 if (op0)
12053 {
12054 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12055 if (speed)
12056 *cost += extra_cost->alu.bfx;
12057 return true;
12058 }
12059
12060 if (speed)
12061 {
12062 if (VECTOR_MODE_P (mode))
12063 *cost += extra_cost->vect.alu;
12064 else
12065 *cost += extra_cost->alu.extend;
12066 }
12067 return false;
12068
12069 case ASHIFT:
12070 op0 = XEXP (x, 0);
12071 op1 = XEXP (x, 1);
12072
12073 if (CONST_INT_P (op1))
12074 {
12075 if (speed)
12076 {
12077 if (VECTOR_MODE_P (mode))
12078 {
12079 /* Vector shift (immediate). */
12080 *cost += extra_cost->vect.alu;
12081 }
12082 else
12083 {
12084 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12085 aliases. */
12086 *cost += extra_cost->alu.shift;
12087 }
12088 }
12089
12090 /* We can incorporate zero/sign extend for free. */
12091 if (GET_CODE (op0) == ZERO_EXTEND
12092 || GET_CODE (op0) == SIGN_EXTEND)
12093 op0 = XEXP (op0, 0);
12094
12095 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12096 return true;
12097 }
12098 else
12099 {
12100 if (VECTOR_MODE_P (mode))
12101 {
12102 if (speed)
12103 /* Vector shift (register). */
12104 *cost += extra_cost->vect.alu;
12105 }
12106 else
12107 {
12108 if (speed)
12109 /* LSLV. */
12110 *cost += extra_cost->alu.shift_reg;
12111
12112 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12113 && CONST_INT_P (XEXP (op1, 1))
12114 && known_eq (INTVAL (XEXP (op1, 1)),
12115 GET_MODE_BITSIZE (mode) - 1))
12116 {
12117 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12118 /* We already demanded XEXP (op1, 0) to be REG_P, so
12119 don't recurse into it. */
12120 return true;
12121 }
12122 }
12123 return false; /* All arguments need to be in registers. */
12124 }
12125
12126 case ROTATE:
12127 case ROTATERT:
12128 case LSHIFTRT:
12129 case ASHIFTRT:
12130 op0 = XEXP (x, 0);
12131 op1 = XEXP (x, 1);
12132
12133 if (CONST_INT_P (op1))
12134 {
12135 /* ASR (immediate) and friends. */
12136 if (speed)
12137 {
12138 if (VECTOR_MODE_P (mode))
12139 *cost += extra_cost->vect.alu;
12140 else
12141 *cost += extra_cost->alu.shift;
12142 }
12143
12144 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12145 return true;
12146 }
12147 else
12148 {
12149 if (VECTOR_MODE_P (mode))
12150 {
12151 if (speed)
12152 /* Vector shift (register). */
12153 *cost += extra_cost->vect.alu;
12154 }
12155 else
12156 {
12157 if (speed)
12158 /* ASR (register) and friends. */
12159 *cost += extra_cost->alu.shift_reg;
12160
12161 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12162 && CONST_INT_P (XEXP (op1, 1))
12163 && known_eq (INTVAL (XEXP (op1, 1)),
12164 GET_MODE_BITSIZE (mode) - 1))
12165 {
12166 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12167 /* We already demanded XEXP (op1, 0) to be REG_P, so
12168 don't recurse into it. */
12169 return true;
12170 }
12171 }
12172 return false; /* All arguments need to be in registers. */
12173 }
12174
12175 case SYMBOL_REF:
12176
12177 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12178 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12179 {
12180 /* LDR. */
12181 if (speed)
12182 *cost += extra_cost->ldst.load;
12183 }
12184 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12185 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12186 {
12187 /* ADRP, followed by ADD. */
12188 *cost += COSTS_N_INSNS (1);
12189 if (speed)
12190 *cost += 2 * extra_cost->alu.arith;
12191 }
12192 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12193 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12194 {
12195 /* ADR. */
12196 if (speed)
12197 *cost += extra_cost->alu.arith;
12198 }
12199
12200 if (flag_pic)
12201 {
12202 /* One extra load instruction, after accessing the GOT. */
12203 *cost += COSTS_N_INSNS (1);
12204 if (speed)
12205 *cost += extra_cost->ldst.load;
12206 }
12207 return true;
12208
12209 case HIGH:
12210 case LO_SUM:
12211 /* ADRP/ADD (immediate). */
12212 if (speed)
12213 *cost += extra_cost->alu.arith;
12214 return true;
12215
12216 case ZERO_EXTRACT:
12217 case SIGN_EXTRACT:
12218 /* UBFX/SBFX. */
12219 if (speed)
12220 {
12221 if (VECTOR_MODE_P (mode))
12222 *cost += extra_cost->vect.alu;
12223 else
12224 *cost += extra_cost->alu.bfx;
12225 }
12226
12227 /* We can trust that the immediates used will be correct (there
12228 are no by-register forms), so we need only cost op0. */
12229 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12230 return true;
12231
12232 case MULT:
12233 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12234 /* aarch64_rtx_mult_cost always handles recursion to its
12235 operands. */
12236 return true;
12237
12238 case MOD:
12239 /* We can expand a signed mod by a power of 2 using a NEGS, two parallel
12240 ANDs and a CSNEG. Assume here that a CSNEG has the same cost as
12241 an unconditional negate. This case should only ever be reached through
12242 the set_smod_pow2_cheap check in expmed.c. */
12243 if (CONST_INT_P (XEXP (x, 1))
12244 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12245 && (mode == SImode || mode == DImode))
12246 {
12247 /* We expand to 4 instructions. Reset the baseline. */
12248 *cost = COSTS_N_INSNS (4);
12249
12250 if (speed)
12251 *cost += 2 * extra_cost->alu.logical
12252 + 2 * extra_cost->alu.arith;
12253
12254 return true;
12255 }
12256
12257 /* Fall-through. */
12258 case UMOD:
12259 if (speed)
12260 {
12261 /* Slightly prefer UMOD over SMOD. */
12262 if (VECTOR_MODE_P (mode))
12263 *cost += extra_cost->vect.alu;
12264 else if (GET_MODE_CLASS (mode) == MODE_INT)
12265 *cost += (extra_cost->mult[mode == DImode].add
12266 + extra_cost->mult[mode == DImode].idiv
12267 + (code == MOD ? 1 : 0));
12268 }
12269 return false; /* All arguments need to be in registers. */
12270
12271 case DIV:
12272 case UDIV:
12273 case SQRT:
12274 if (speed)
12275 {
12276 if (VECTOR_MODE_P (mode))
12277 *cost += extra_cost->vect.alu;
12278 else if (GET_MODE_CLASS (mode) == MODE_INT)
12279 /* There is no integer SQRT, so only DIV and UDIV can get
12280 here. */
12281 *cost += (extra_cost->mult[mode == DImode].idiv
12282 /* Slightly prefer UDIV over SDIV. */
12283 + (code == DIV ? 1 : 0));
12284 else
12285 *cost += extra_cost->fp[mode == DFmode].div;
12286 }
12287 return false; /* All arguments need to be in registers. */
12288
12289 case IF_THEN_ELSE:
12290 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12291 XEXP (x, 2), cost, speed);
12292
12293 case EQ:
12294 case NE:
12295 case GT:
12296 case GTU:
12297 case LT:
12298 case LTU:
12299 case GE:
12300 case GEU:
12301 case LE:
12302 case LEU:
12303
12304 return false; /* All arguments must be in registers. */
12305
12306 case FMA:
12307 op0 = XEXP (x, 0);
12308 op1 = XEXP (x, 1);
12309 op2 = XEXP (x, 2);
12310
12311 if (speed)
12312 {
12313 if (VECTOR_MODE_P (mode))
12314 *cost += extra_cost->vect.alu;
12315 else
12316 *cost += extra_cost->fp[mode == DFmode].fma;
12317 }
12318
12319 /* FMSUB, FNMADD, and FNMSUB are free. */
12320 if (GET_CODE (op0) == NEG)
12321 op0 = XEXP (op0, 0);
12322
12323 if (GET_CODE (op2) == NEG)
12324 op2 = XEXP (op2, 0);
12325
12326 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12327 and the by-element operand as operand 0. */
12328 if (GET_CODE (op1) == NEG)
12329 op1 = XEXP (op1, 0);
12330
12331 /* Catch vector-by-element operations. The by-element operand can
12332 either be (vec_duplicate (vec_select (x))) or just
12333 (vec_select (x)), depending on whether we are multiplying by
12334 a vector or a scalar.
12335
12336 Canonicalization is not very good in these cases: FMA4 will put the
12337 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
12338 if (GET_CODE (op0) == VEC_DUPLICATE)
12339 op0 = XEXP (op0, 0);
12340 else if (GET_CODE (op1) == VEC_DUPLICATE)
12341 op1 = XEXP (op1, 0);
12342
12343 if (GET_CODE (op0) == VEC_SELECT)
12344 op0 = XEXP (op0, 0);
12345 else if (GET_CODE (op1) == VEC_SELECT)
12346 op1 = XEXP (op1, 0);
12347
12348 /* If the remaining parameters are not registers,
12349 get the cost to put them into registers. */
12350 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12351 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12352 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12353 return true;
12354
12355 case FLOAT:
12356 case UNSIGNED_FLOAT:
12357 if (speed)
12358 *cost += extra_cost->fp[mode == DFmode].fromint;
12359 return false;
12360
12361 case FLOAT_EXTEND:
12362 if (speed)
12363 {
12364 if (VECTOR_MODE_P (mode))
12365 {
12366 /* Vector widening conversion. */
12367 *cost += extra_cost->vect.alu;
12368 }
12369 else
12370 *cost += extra_cost->fp[mode == DFmode].widen;
12371 }
12372 return false;
12373
12374 case FLOAT_TRUNCATE:
12375 if (speed)
12376 {
12377 if (VECTOR_MODE_P (mode))
12378 {
12379 /* Vector narrowing conversion. */
12380 *cost += extra_cost->vect.alu;
12381 }
12382 else
12383 *cost += extra_cost->fp[mode == DFmode].narrow;
12384 }
12385 return false;
12386
12387 case FIX:
12388 case UNSIGNED_FIX:
12389 x = XEXP (x, 0);
12390 /* Strip the rounding part. They will all be implemented
12391 by the fcvt* family of instructions anyway. */
12392 if (GET_CODE (x) == UNSPEC)
12393 {
12394 unsigned int uns_code = XINT (x, 1);
12395
12396 if (uns_code == UNSPEC_FRINTA
12397 || uns_code == UNSPEC_FRINTM
12398 || uns_code == UNSPEC_FRINTN
12399 || uns_code == UNSPEC_FRINTP
12400 || uns_code == UNSPEC_FRINTZ)
12401 x = XVECEXP (x, 0, 0);
12402 }
12403
12404 if (speed)
12405 {
12406 if (VECTOR_MODE_P (mode))
12407 *cost += extra_cost->vect.alu;
12408 else
12409 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12410 }
12411
12412 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12413 fixed-point fcvt. */
12414 if (GET_CODE (x) == MULT
12415 && ((VECTOR_MODE_P (mode)
12416 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12417 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12418 {
12419 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12420 0, speed);
12421 return true;
12422 }
12423
12424 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12425 return true;
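 /* For example (illustrative asm, register names arbitrary), converting
 "x * 4.0f" to an integer can use the fixed-point form of the convert,
 something like "fcvtzs w0, s0, #2", which is why the multiplication
 above is not costed separately. */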
12426
12427 case ABS:
12428 if (VECTOR_MODE_P (mode))
12429 {
12430 /* ABS (vector). */
12431 if (speed)
12432 *cost += extra_cost->vect.alu;
12433 }
12434 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12435 {
12436 op0 = XEXP (x, 0);
12437
12438 /* FABD, which is analogous to FADD. */
12439 if (GET_CODE (op0) == MINUS)
12440 {
12441 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12442 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12443 if (speed)
12444 *cost += extra_cost->fp[mode == DFmode].addsub;
12445
12446 return true;
12447 }
12448 /* Simple FABS is analogous to FNEG. */
12449 if (speed)
12450 *cost += extra_cost->fp[mode == DFmode].neg;
12451 }
12452 else
12453 {
12454 /* Integer ABS will either be split into
12455 two arithmetic instructions, or will be an ABS
12456 (scalar), which we don't model. */
12457 *cost = COSTS_N_INSNS (2);
12458 if (speed)
12459 *cost += 2 * extra_cost->alu.arith;
12460 }
12461 return false;
12462
12463 case SMAX:
12464 case SMIN:
12465 if (speed)
12466 {
12467 if (VECTOR_MODE_P (mode))
12468 *cost += extra_cost->vect.alu;
12469 else
12470 {
12471 /* FMAXNM/FMINNM/FMAX/FMIN.
12472 TODO: This may not be accurate for all implementations, but
12473 we do not model this in the cost tables. */
12474 *cost += extra_cost->fp[mode == DFmode].addsub;
12475 }
12476 }
12477 return false;
12478
12479 case UNSPEC:
12480 /* The floating point round to integer frint* instructions. */
12481 if (aarch64_frint_unspec_p (XINT (x, 1)))
12482 {
12483 if (speed)
12484 *cost += extra_cost->fp[mode == DFmode].roundint;
12485
12486 return false;
12487 }
12488
12489 if (XINT (x, 1) == UNSPEC_RBIT)
12490 {
12491 if (speed)
12492 *cost += extra_cost->alu.rev;
12493
12494 return false;
12495 }
12496 break;
12497
12498 case TRUNCATE:
12499
12500 /* Decompose <su>muldi3_highpart. */
12501 if (/* (truncate:DI */
12502 mode == DImode
12503 /* (lshiftrt:TI */
12504 && GET_MODE (XEXP (x, 0)) == TImode
12505 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12506 /* (mult:TI */
12507 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12508 /* (ANY_EXTEND:TI (reg:DI))
12509 (ANY_EXTEND:TI (reg:DI))) */
12510 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12511 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12512 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12513 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12514 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12515 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12516 /* (const_int 64) */
12517 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12518 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12519 {
12520 /* UMULH/SMULH. */
12521 if (speed)
12522 *cost += extra_cost->mult[mode == DImode].extend;
12523 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12524 mode, MULT, 0, speed);
12525 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12526 mode, MULT, 1, speed);
12527 return true;
12528 }
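 /* For example, the DImode high-part multiply written in C as
 "(unsigned __int128) a * b >> 64" has this shape and is costed as a
 single UMULH (or SMULH when both operands are sign-extended). */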
12529
12530 /* Fall through. */
12531 default:
12532 break;
12533 }
12534
12535 if (dump_file
12536 && flag_aarch64_verbose_cost)
12537 fprintf (dump_file,
12538 "\nFailed to cost RTX. Assuming default cost.\n");
12539
12540 return true;
12541 }
12542
12543 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12544 calculated for X. This cost is stored in *COST. Returns true
12545 if the total cost of X was calculated. */
12546 static bool
12547 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12548 int param, int *cost, bool speed)
12549 {
12550 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12551
12552 if (dump_file
12553 && flag_aarch64_verbose_cost)
12554 {
12555 print_rtl_single (dump_file, x);
12556 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12557 speed ? "Hot" : "Cold",
12558 *cost, result ? "final" : "partial");
12559 }
12560
12561 return result;
12562 }
12563
12564 static int
12565 aarch64_register_move_cost (machine_mode mode,
12566 reg_class_t from_i, reg_class_t to_i)
12567 {
12568 enum reg_class from = (enum reg_class) from_i;
12569 enum reg_class to = (enum reg_class) to_i;
12570 const struct cpu_regmove_cost *regmove_cost
12571 = aarch64_tune_params.regmove_cost;
12572
12573 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12574 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12575 to = GENERAL_REGS;
12576
12577 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12578 from = GENERAL_REGS;
12579
12580 /* Make RDFFR very expensive. In particular, if we know that the FFR
12581 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12582 as a way of obtaining a PTRUE. */
12583 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12584 && hard_reg_set_subset_p (reg_class_contents[from_i],
12585 reg_class_contents[FFR_REGS]))
12586 return 80;
12587
12588 /* Moving between GPR and stack cost is the same as GP2GP. */
12589 if ((from == GENERAL_REGS && to == STACK_REG)
12590 || (to == GENERAL_REGS && from == STACK_REG))
12591 return regmove_cost->GP2GP;
12592
12593 /* To/from the stack register, we move via the GPRs. */
12594 if (to == STACK_REG || from == STACK_REG)
12595 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12596 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12597
12598 if (known_eq (GET_MODE_SIZE (mode), 16))
12599 {
12600 /* 128-bit operations on general registers require 2 instructions. */
12601 if (from == GENERAL_REGS && to == GENERAL_REGS)
12602 return regmove_cost->GP2GP * 2;
12603 else if (from == GENERAL_REGS)
12604 return regmove_cost->GP2FP * 2;
12605 else if (to == GENERAL_REGS)
12606 return regmove_cost->FP2GP * 2;
12607
12608 /* When AdvSIMD instructions are disabled it is not possible to move
12609 a 128-bit value directly between Q registers. This is handled in
12610 secondary reload. A general register is used as a scratch to move
12611 the upper DI value and the lower DI value is moved directly,
12612 hence the cost is the sum of three moves. */
12613 if (! TARGET_SIMD)
12614 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12615
12616 return regmove_cost->FP2FP;
12617 }
12618
12619 if (from == GENERAL_REGS && to == GENERAL_REGS)
12620 return regmove_cost->GP2GP;
12621 else if (from == GENERAL_REGS)
12622 return regmove_cost->GP2FP;
12623 else if (to == GENERAL_REGS)
12624 return regmove_cost->FP2GP;
12625
12626 return regmove_cost->FP2FP;
12627 }
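 /* Worked example with made-up regmove_cost values: if GP2FP = 5,
 FP2GP = 6 and FP2FP = 2, then a 16-byte FP-to-FP move without
 TARGET_SIMD is costed as 5 + 6 + 2 = 13, reflecting the GPR-scratch
 sequence described above, while the same move with TARGET_SIMD costs
 just FP2FP = 2. */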
12628
12629 static int
12630 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12631 reg_class_t rclass ATTRIBUTE_UNUSED,
12632 bool in ATTRIBUTE_UNUSED)
12633 {
12634 return aarch64_tune_params.memmov_cost;
12635 }
12636
12637 /* Implement TARGET_INIT_BUILTINS. */
12638 static void
12639 aarch64_init_builtins ()
12640 {
12641 aarch64_general_init_builtins ();
12642 aarch64_sve::init_builtins ();
12643 }
12644
12645 /* Implement TARGET_FOLD_BUILTIN. */
12646 static tree
12647 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12648 {
12649 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12650 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12651 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12652 switch (code & AARCH64_BUILTIN_CLASS)
12653 {
12654 case AARCH64_BUILTIN_GENERAL:
12655 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12656
12657 case AARCH64_BUILTIN_SVE:
12658 return NULL_TREE;
12659 }
12660 gcc_unreachable ();
12661 }
12662
12663 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12664 static bool
12665 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12666 {
12667 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12668 tree fndecl = gimple_call_fndecl (stmt);
12669 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12670 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12671 gimple *new_stmt = NULL;
12672 switch (code & AARCH64_BUILTIN_CLASS)
12673 {
12674 case AARCH64_BUILTIN_GENERAL:
12675 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12676 break;
12677
12678 case AARCH64_BUILTIN_SVE:
12679 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12680 break;
12681 }
12682
12683 if (!new_stmt)
12684 return false;
12685
12686 gsi_replace (gsi, new_stmt, true);
12687 return true;
12688 }
12689
12690 /* Implement TARGET_EXPAND_BUILTIN. */
12691 static rtx
12692 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12693 {
12694 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12695 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12696 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12697 switch (code & AARCH64_BUILTIN_CLASS)
12698 {
12699 case AARCH64_BUILTIN_GENERAL:
12700 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12701
12702 case AARCH64_BUILTIN_SVE:
12703 return aarch64_sve::expand_builtin (subcode, exp, target);
12704 }
12705 gcc_unreachable ();
12706 }
12707
12708 /* Implement TARGET_BUILTIN_DECL. */
12709 static tree
12710 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12711 {
12712 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12713 switch (code & AARCH64_BUILTIN_CLASS)
12714 {
12715 case AARCH64_BUILTIN_GENERAL:
12716 return aarch64_general_builtin_decl (subcode, initialize_p);
12717
12718 case AARCH64_BUILTIN_SVE:
12719 return aarch64_sve::builtin_decl (subcode, initialize_p);
12720 }
12721 gcc_unreachable ();
12722 }
12723
12724 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12725 to optimize 1.0/sqrt. */
12726
12727 static bool
12728 use_rsqrt_p (machine_mode mode)
12729 {
12730 return (!flag_trapping_math
12731 && flag_unsafe_math_optimizations
12732 && ((aarch64_tune_params.approx_modes->recip_sqrt
12733 & AARCH64_APPROX_MODE (mode))
12734 || flag_mrecip_low_precision_sqrt));
12735 }
12736
12737 /* Function to decide when to use the approximate reciprocal square root
12738 builtin. */
12739
12740 static tree
12741 aarch64_builtin_reciprocal (tree fndecl)
12742 {
12743 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12744
12745 if (!use_rsqrt_p (mode))
12746 return NULL_TREE;
12747 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12748 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12749 switch (code & AARCH64_BUILTIN_CLASS)
12750 {
12751 case AARCH64_BUILTIN_GENERAL:
12752 return aarch64_general_builtin_rsqrt (subcode);
12753
12754 case AARCH64_BUILTIN_SVE:
12755 return NULL_TREE;
12756 }
12757 gcc_unreachable ();
12758 }
12759
12760 /* Emit code to perform the floating-point operation:
12761
12762 DST = SRC1 * SRC2
12763
12764 where all three operands are already known to be registers.
12765 If the operation is an SVE one, PTRUE is a suitable all-true
12766 predicate. */
12767
12768 static void
12769 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
12770 {
12771 if (ptrue)
12772 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
12773 dst, ptrue, src1, src2,
12774 gen_int_mode (SVE_RELAXED_GP, SImode)));
12775 else
12776 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
12777 }
12778
12779 /* Emit instruction sequence to compute either the approximate square root
12780 or its approximate reciprocal, depending on the flag RECP, and return
12781 whether the sequence was emitted or not. */
12782
12783 bool
12784 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12785 {
12786 machine_mode mode = GET_MODE (dst);
12787
12788 if (GET_MODE_INNER (mode) == HFmode)
12789 {
12790 gcc_assert (!recp);
12791 return false;
12792 }
12793
12794 if (!recp)
12795 {
12796 if (!(flag_mlow_precision_sqrt
12797 || (aarch64_tune_params.approx_modes->sqrt
12798 & AARCH64_APPROX_MODE (mode))))
12799 return false;
12800
12801 if (!flag_finite_math_only
12802 || flag_trapping_math
12803 || !flag_unsafe_math_optimizations
12804 || optimize_function_for_size_p (cfun))
12805 return false;
12806 }
12807 else
12808 /* Caller assumes we cannot fail. */
12809 gcc_assert (use_rsqrt_p (mode));
12810
12811 rtx pg = NULL_RTX;
12812 if (aarch64_sve_mode_p (mode))
12813 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
12814 machine_mode mmsk = (VECTOR_MODE_P (mode)
12815 ? related_int_vector_mode (mode).require ()
12816 : int_mode_for_mode (mode).require ());
12817 rtx xmsk = NULL_RTX;
12818 if (!recp)
12819 {
12820 /* When calculating the approximate square root, compare the
12821 argument with 0.0 and create a mask. */
12822 rtx zero = CONST0_RTX (mode);
12823 if (pg)
12824 {
12825 xmsk = gen_reg_rtx (GET_MODE (pg));
12826 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
12827 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
12828 xmsk, pg, hint, src, zero));
12829 }
12830 else
12831 {
12832 xmsk = gen_reg_rtx (mmsk);
12833 emit_insn (gen_rtx_SET (xmsk,
12834 gen_rtx_NEG (mmsk,
12835 gen_rtx_EQ (mmsk, src, zero))));
12836 }
12837 }
12838
12839 /* Estimate the approximate reciprocal square root. */
12840 rtx xdst = gen_reg_rtx (mode);
12841 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12842
12843 /* Iterate over the series twice for SF and thrice for DF. */
12844 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12845
12846 /* Optionally iterate over the series once less for faster performance
12847 while sacrificing some accuracy. */
12848 if ((recp && flag_mrecip_low_precision_sqrt)
12849 || (!recp && flag_mlow_precision_sqrt))
12850 iterations--;
12851
12852 /* Iterate over the series to calculate the approximate reciprocal square
12853 root. */
12854 rtx x1 = gen_reg_rtx (mode);
12855 while (iterations--)
12856 {
12857 rtx x2 = gen_reg_rtx (mode);
12858 aarch64_emit_mult (x2, pg, xdst, xdst);
12859
12860 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12861
12862 if (iterations > 0)
12863 aarch64_emit_mult (xdst, pg, xdst, x1);
12864 }
12865
12866 if (!recp)
12867 {
12868 if (pg)
12869 /* Multiply nonzero source values by the corresponding intermediate
12870 result elements, so that the final calculation is the approximate
12871 square root rather than its reciprocal. Select a zero result for
12872 zero source values, to avoid the Inf * 0 -> NaN that we'd get
12873 otherwise. */
12874 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
12875 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
12876 else
12877 {
12878 /* Qualify the approximate reciprocal square root when the
12879 argument is 0.0 by squashing the intermediary result to 0.0. */
12880 rtx xtmp = gen_reg_rtx (mmsk);
12881 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12882 gen_rtx_SUBREG (mmsk, xdst, 0)));
12883 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12884
12885 /* Calculate the approximate square root. */
12886 aarch64_emit_mult (xdst, pg, xdst, src);
12887 }
12888 }
12889
12890 /* Finalize the approximation. */
12891 aarch64_emit_mult (dst, pg, xdst, x1);
12892
12893 return true;
12894 }
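 /* A sketch of the maths implemented above: FRSQRTE produces an initial
 estimate x0 ~= 1/sqrt(d), and each FRSQRTS step applies the
 Newton-Raphson refinement

 x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

 since FRSQRTS (a, b) computes (3 - a * b) / 2. For the non-reciprocal
 case the final result is additionally multiplied by the source, using
 sqrt(d) = d * (1/sqrt(d)), with zero inputs masked off to avoid the
 Inf * 0 -> NaN case. */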
12895
12896 /* Emit the instruction sequence to compute the approximation for the division
12897 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12898
12899 bool
12900 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12901 {
12902 machine_mode mode = GET_MODE (quo);
12903
12904 if (GET_MODE_INNER (mode) == HFmode)
12905 return false;
12906
12907 bool use_approx_division_p = (flag_mlow_precision_div
12908 || (aarch64_tune_params.approx_modes->division
12909 & AARCH64_APPROX_MODE (mode)));
12910
12911 if (!flag_finite_math_only
12912 || flag_trapping_math
12913 || !flag_unsafe_math_optimizations
12914 || optimize_function_for_size_p (cfun)
12915 || !use_approx_division_p)
12916 return false;
12917
12918 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12919 return false;
12920
12921 rtx pg = NULL_RTX;
12922 if (aarch64_sve_mode_p (mode))
12923 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
12924
12925 /* Estimate the approximate reciprocal. */
12926 rtx xrcp = gen_reg_rtx (mode);
12927 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12928
12929 /* Iterate over the series twice for SF and thrice for DF. */
12930 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12931
12932 /* Optionally iterate over the series less for faster performance,
12933 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
12934 if (flag_mlow_precision_div)
12935 iterations = (GET_MODE_INNER (mode) == DFmode
12936 ? aarch64_double_recp_precision
12937 : aarch64_float_recp_precision);
12938
12939 /* Iterate over the series to calculate the approximate reciprocal. */
12940 rtx xtmp = gen_reg_rtx (mode);
12941 while (iterations--)
12942 {
12943 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12944
12945 if (iterations > 0)
12946 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
12947 }
12948
12949 if (num != CONST1_RTX (mode))
12950 {
12951 /* As the approximate reciprocal of DEN is already calculated, only
12952 calculate the approximate division when NUM is not 1.0. */
12953 rtx xnum = force_reg (mode, num);
12954 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
12955 }
12956
12957 /* Finalize the approximation. */
12958 aarch64_emit_mult (quo, pg, xrcp, xtmp);
12959 return true;
12960 }
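 /* Similarly to the square-root case above: FRECPE produces an initial
 estimate x0 ~= 1/den, and each FRECPS step applies the Newton-Raphson
 refinement

 x_{n+1} = x_n * (2 - den * x_n)

 since FRECPS (a, b) computes 2 - a * b. The quotient is then formed
 as num * (1/den). */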
12961
12962 /* Return the number of instructions that can be issued per cycle. */
12963 static int
12964 aarch64_sched_issue_rate (void)
12965 {
12966 return aarch64_tune_params.issue_rate;
12967 }
12968
12969 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12970 static int
12971 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12972 {
12973 if (DEBUG_INSN_P (insn))
12974 return more;
12975
12976 rtx_code code = GET_CODE (PATTERN (insn));
12977 if (code == USE || code == CLOBBER)
12978 return more;
12979
12980 if (get_attr_type (insn) == TYPE_NO_INSN)
12981 return more;
12982
12983 return more - 1;
12984 }
12985
12986 static int
12987 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12988 {
12989 int issue_rate = aarch64_sched_issue_rate ();
12990
12991 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12992 }
12993
12994
12995 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12996 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12997 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12998
12999 static int
13000 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13001 int ready_index)
13002 {
13003 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13004 }
13005
13006
13007 /* Vectorizer cost model target hooks. */
13008
13009 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13010 static int
13011 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13012 tree vectype,
13013 int misalign ATTRIBUTE_UNUSED)
13014 {
13015 unsigned elements;
13016 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13017 bool fp = false;
13018
13019 if (vectype != NULL)
13020 fp = FLOAT_TYPE_P (vectype);
13021
13022 switch (type_of_cost)
13023 {
13024 case scalar_stmt:
13025 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13026
13027 case scalar_load:
13028 return costs->scalar_load_cost;
13029
13030 case scalar_store:
13031 return costs->scalar_store_cost;
13032
13033 case vector_stmt:
13034 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13035
13036 case vector_load:
13037 return costs->vec_align_load_cost;
13038
13039 case vector_store:
13040 return costs->vec_store_cost;
13041
13042 case vec_to_scalar:
13043 return costs->vec_to_scalar_cost;
13044
13045 case scalar_to_vec:
13046 return costs->scalar_to_vec_cost;
13047
13048 case unaligned_load:
13049 case vector_gather_load:
13050 return costs->vec_unalign_load_cost;
13051
13052 case unaligned_store:
13053 case vector_scatter_store:
13054 return costs->vec_unalign_store_cost;
13055
13056 case cond_branch_taken:
13057 return costs->cond_taken_branch_cost;
13058
13059 case cond_branch_not_taken:
13060 return costs->cond_not_taken_branch_cost;
13061
13062 case vec_perm:
13063 return costs->vec_permute_cost;
13064
13065 case vec_promote_demote:
13066 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13067
13068 case vec_construct:
13069 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
13070 return elements / 2 + 1;
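 /* For example, with an estimated 4 elements per vector this evaluates
 to 4 / 2 + 1 = 3; the heuristic is roughly one cost unit per pair of
 element insertions plus one. */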
13071
13072 default:
13073 gcc_unreachable ();
13074 }
13075 }
13076
13077 /* Return true if STMT_INFO extends the result of a load. */
13078 static bool
13079 aarch64_extending_load_p (stmt_vec_info stmt_info)
13080 {
13081 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13082 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13083 return false;
13084
13085 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13086 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13087 tree rhs_type = TREE_TYPE (rhs);
13088 if (!INTEGRAL_TYPE_P (lhs_type)
13089 || !INTEGRAL_TYPE_P (rhs_type)
13090 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13091 return false;
13092
13093 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
13094 return (def_stmt_info
13095 && STMT_VINFO_DATA_REF (def_stmt_info)
13096 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13097 }
13098
13099 /* Return true if STMT_INFO is an integer truncation. */
13100 static bool
13101 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13102 {
13103 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13104 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13105 return false;
13106
13107 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13108 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13109 return (INTEGRAL_TYPE_P (lhs_type)
13110 && INTEGRAL_TYPE_P (rhs_type)
13111 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13112 }
13113
13114 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13115 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13116 for SVE targets. */
13117 static unsigned int
13118 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
13119 unsigned int stmt_cost)
13120 {
13121 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13122 vector register size or number of units. Integer promotions of this
13123 type therefore map to SXT[BHW] or UXT[BHW].
13124
13125 Most loads have extending forms that can do the sign or zero extension
13126 on the fly. Optimistically assume that a load followed by an extension
13127 will fold to this form during combine, and that the extension therefore
13128 comes for free. */
13129 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13130 stmt_cost = 0;
13131
13132 /* For similar reasons, vector_stmt integer truncations are a no-op,
13133 because we can just ignore the unused upper bits of the source. */
13134 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13135 stmt_cost = 0;
13136
13137 return stmt_cost;
13138 }
13139
13140 /* Implement targetm.vectorize.add_stmt_cost. */
13141 static unsigned
13142 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13143 struct _stmt_vec_info *stmt_info, int misalign,
13144 enum vect_cost_model_location where)
13145 {
13146 unsigned *cost = (unsigned *) data;
13147 unsigned retval = 0;
13148
13149 if (flag_vect_cost_model)
13150 {
13151 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13152 int stmt_cost =
13153 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13154
13155 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13156 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13157
13158 /* Statements in an inner loop relative to the loop being
13159 vectorized are weighted more heavily. The value here is
13160 arbitrary and could potentially be improved with analysis. */
13161 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13162 count *= 50; /* FIXME */
13163
13164 retval = (unsigned) (count * stmt_cost);
13165 cost[where] += retval;
13166 }
13167
13168 return retval;
13169 }
13170
13171 static void initialize_aarch64_code_model (struct gcc_options *);
13172
13173 /* Parse the TO_PARSE string and put the architecture struct that it
13174 selects into RES and the architectural features into ISA_FLAGS.
13175 Return an aarch64_parse_opt_result describing the parse result.
13176 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13177 When the TO_PARSE string contains an invalid extension,
13178 a copy of the string is created and stored in INVALID_EXTENSION. */
13179
13180 static enum aarch64_parse_opt_result
13181 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13182 uint64_t *isa_flags, std::string *invalid_extension)
13183 {
13184 const char *ext;
13185 const struct processor *arch;
13186 size_t len;
13187
13188 ext = strchr (to_parse, '+');
13189
13190 if (ext != NULL)
13191 len = ext - to_parse;
13192 else
13193 len = strlen (to_parse);
13194
13195 if (len == 0)
13196 return AARCH64_PARSE_MISSING_ARG;
13197
13198
13199 /* Loop through the list of supported ARCHes to find a match. */
13200 for (arch = all_architectures; arch->name != NULL; arch++)
13201 {
13202 if (strlen (arch->name) == len
13203 && strncmp (arch->name, to_parse, len) == 0)
13204 {
13205 uint64_t isa_temp = arch->flags;
13206
13207 if (ext != NULL)
13208 {
13209 /* TO_PARSE string contains at least one extension. */
13210 enum aarch64_parse_opt_result ext_res
13211 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13212
13213 if (ext_res != AARCH64_PARSE_OK)
13214 return ext_res;
13215 }
13216 /* Extension parsing was successful. Confirm the result
13217 arch and ISA flags. */
13218 *res = arch;
13219 *isa_flags = isa_temp;
13220 return AARCH64_PARSE_OK;
13221 }
13222 }
13223
13224 /* ARCH name not found in list. */
13225 return AARCH64_PARSE_INVALID_ARG;
13226 }
13227
13228 /* Parse the TO_PARSE string and put the result tuning in RES and the
13229 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13230 describing the parse result. If there is an error parsing, RES and
13231 ISA_FLAGS are left unchanged.
13232 When the TO_PARSE string contains an invalid extension,
13233 a copy of the string is created and stored in INVALID_EXTENSION. */
13234
13235 static enum aarch64_parse_opt_result
13236 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13237 uint64_t *isa_flags, std::string *invalid_extension)
13238 {
13239 const char *ext;
13240 const struct processor *cpu;
13241 size_t len;
13242
13243 ext = strchr (to_parse, '+');
13244
13245 if (ext != NULL)
13246 len = ext - to_parse;
13247 else
13248 len = strlen (to_parse);
13249
13250 if (len == 0)
13251 return AARCH64_PARSE_MISSING_ARG;
13252
13253
13254 /* Loop through the list of supported CPUs to find a match. */
13255 for (cpu = all_cores; cpu->name != NULL; cpu++)
13256 {
13257 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13258 {
13259 uint64_t isa_temp = cpu->flags;
13260
13261
13262 if (ext != NULL)
13263 {
13264 /* TO_PARSE string contains at least one extension. */
13265 enum aarch64_parse_opt_result ext_res
13266 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13267
13268 if (ext_res != AARCH64_PARSE_OK)
13269 return ext_res;
13270 }
13271 /* Extension parsing was successful. Confirm the result
13272 cpu and ISA flags. */
13273 *res = cpu;
13274 *isa_flags = isa_temp;
13275 return AARCH64_PARSE_OK;
13276 }
13277 }
13278
13279 /* CPU name not found in list. */
13280 return AARCH64_PARSE_INVALID_ARG;
13281 }
13282
13283 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13284 Return an aarch64_parse_opt_result describing the parse result.
13285 If the parsing fails, RES does not change. */
13286
13287 static enum aarch64_parse_opt_result
13288 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13289 {
13290 const struct processor *cpu;
13291
13292 /* Loop through the list of supported CPUs to find a match. */
13293 for (cpu = all_cores; cpu->name != NULL; cpu++)
13294 {
13295 if (strcmp (cpu->name, to_parse) == 0)
13296 {
13297 *res = cpu;
13298 return AARCH64_PARSE_OK;
13299 }
13300 }
13301
13302 /* CPU name not found in list. */
13303 return AARCH64_PARSE_INVALID_ARG;
13304 }
13305
13306 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13307 described in FLAG. If it is, return the index bit for that fusion type.
13308 If not, error (printing OPTION_NAME) and return zero. */
13309
13310 static unsigned int
13311 aarch64_parse_one_option_token (const char *token,
13312 size_t length,
13313 const struct aarch64_flag_desc *flag,
13314 const char *option_name)
13315 {
13316 for (; flag->name != NULL; flag++)
13317 {
13318 if (length == strlen (flag->name)
13319 && !strncmp (flag->name, token, length))
13320 return flag->flag;
13321 }
13322
13323 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13324 return 0;
13325 }
13326
13327 /* Parse OPTION which is a comma-separated list of flags to enable.
13328 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13329 default state we inherit from the CPU tuning structures. OPTION_NAME
13330 gives the top-level option we are parsing in the -moverride string,
13331 for use in error messages. */
13332
13333 static unsigned int
13334 aarch64_parse_boolean_options (const char *option,
13335 const struct aarch64_flag_desc *flags,
13336 unsigned int initial_state,
13337 const char *option_name)
13338 {
13339 const char separator = '.';
13340 const char* specs = option;
13341 const char* ntoken = option;
13342 unsigned int found_flags = initial_state;
13343
13344 while ((ntoken = strchr (specs, separator)))
13345 {
13346 size_t token_length = ntoken - specs;
13347 unsigned token_ops = aarch64_parse_one_option_token (specs,
13348 token_length,
13349 flags,
13350 option_name);
13351 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13352 in the token stream, reset the supported operations. So:
13353
13354 adrp+add.cmp+branch.none.adrp+add
13355
13356 would have the result of turning on only adrp+add fusion. */
13357 if (!token_ops)
13358 found_flags = 0;
13359
13360 found_flags |= token_ops;
13361 specs = ++ntoken;
13362 }
13363
13364 /* The string ended with a trailing separator, so the final token is empty. */
13365 if (!(*specs))
13366 {
13367 error ("%s string ill-formed\n", option_name);
13368 return 0;
13369 }
13370
13371 /* We still have one more token to parse. */
13372 size_t token_length = strlen (specs);
13373 unsigned token_ops = aarch64_parse_one_option_token (specs,
13374 token_length,
13375 flags,
13376 option_name);
13377 if (!token_ops)
13378 found_flags = 0;
13379
13380 found_flags |= token_ops;
13381 return found_flags;
13382 }
13383
13384 /* Support for overriding instruction fusion. */
13385
13386 static void
13387 aarch64_parse_fuse_string (const char *fuse_string,
13388 struct tune_params *tune)
13389 {
13390 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13391 aarch64_fusible_pairs,
13392 tune->fusible_ops,
13393 "fuse=");
13394 }
13395
13396 /* Support for overriding other tuning flags. */
13397
13398 static void
13399 aarch64_parse_tune_string (const char *tune_string,
13400 struct tune_params *tune)
13401 {
13402 tune->extra_tuning_flags
13403 = aarch64_parse_boolean_options (tune_string,
13404 aarch64_tuning_flags,
13405 tune->extra_tuning_flags,
13406 "tune=");
13407 }
13408
13409 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
13410 Accept the valid SVE vector widths allowed by
13411 aarch64_sve_vector_bits_enum and use it to override sve_width
13412 in TUNE. */
13413
13414 static void
13415 aarch64_parse_sve_width_string (const char *tune_string,
13416 struct tune_params *tune)
13417 {
13418 int width = -1;
13419
13420 int n = sscanf (tune_string, "%d", &width);
13421 if (n != 1)
13422 {
13423 error ("invalid format for sve_width");
13424 return;
13425 }
13426 switch (width)
13427 {
13428 case SVE_128:
13429 case SVE_256:
13430 case SVE_512:
13431 case SVE_1024:
13432 case SVE_2048:
13433 break;
13434 default:
13435 error ("invalid sve_width value: %d", width);
13436 }
13437 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13438 }
13439
13440 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13441 we understand. If it is, extract the option string and hand it off to
13442 the appropriate function. */
13443
13444 void
13445 aarch64_parse_one_override_token (const char* token,
13446 size_t length,
13447 struct tune_params *tune)
13448 {
13449 const struct aarch64_tuning_override_function *fn
13450 = aarch64_tuning_override_functions;
13451
13452 const char *option_part = strchr (token, '=');
13453 if (!option_part)
13454 {
13455 error ("tuning string missing in option (%s)", token);
13456 return;
13457 }
13458
13459 /* Get the length of the option name. */
13460 length = option_part - token;
13461 /* Skip the '=' to get to the option string. */
13462 option_part++;
13463
13464 for (; fn->name != NULL; fn++)
13465 {
13466 if (!strncmp (fn->name, token, length))
13467 {
13468 fn->parse_override (option_part, tune);
13469 return;
13470 }
13471 }
13472
13473 error ("unknown tuning option (%s)",token);
13474 return;
13475 }
13476
13477 /* Validate and clamp the TLS size for the selected code model. */
13478
13479 static void
13480 initialize_aarch64_tls_size (struct gcc_options *opts)
13481 {
13482 if (aarch64_tls_size == 0)
13483 aarch64_tls_size = 24;
13484
13485 switch (opts->x_aarch64_cmodel_var)
13486 {
13487 case AARCH64_CMODEL_TINY:
13488 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13489 needs two instructions to address, so we clamp the size to 24. */
13490 if (aarch64_tls_size > 24)
13491 aarch64_tls_size = 24;
13492 break;
13493 case AARCH64_CMODEL_SMALL:
13494 /* The maximum TLS size allowed under small is 4G. */
13495 if (aarch64_tls_size > 32)
13496 aarch64_tls_size = 32;
13497 break;
13498 case AARCH64_CMODEL_LARGE:
13499 /* The maximum TLS size allowed under large is 16E.
13500 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
13501 if (aarch64_tls_size > 48)
13502 aarch64_tls_size = 48;
13503 break;
13504 default:
13505 gcc_unreachable ();
13506 }
13507
13508 return;
13509 }
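 /* For example, an explicit -mtls-size=32 is clamped to 24 here under the
 tiny code model, and -mtls-size=48 is left unchanged only under the
 large model (small clamps it to 32). */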
13510
13511 /* Parse STRING looking for options in the format:
13512 string :: option:string
13513 option :: name=substring
13514 name :: {a-z}
13515 substring :: defined by option. */
13516
13517 static void
13518 aarch64_parse_override_string (const char* input_string,
13519 struct tune_params* tune)
13520 {
13521 const char separator = ':';
13522 size_t string_length = strlen (input_string) + 1;
13523 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13524 char *string = string_root;
13525 strncpy (string, input_string, string_length);
13526 string[string_length - 1] = '\0';
13527
13528 char* ntoken = string;
13529
13530 while ((ntoken = strchr (string, separator)))
13531 {
13532 size_t token_length = ntoken - string;
13533 /* NUL-terminate this substring so it can be parsed on its own. */
13534 *ntoken = '\0';
13535 aarch64_parse_one_override_token (string, token_length, tune);
13536 string = ++ntoken;
13537 }
13538
13539 /* One last option to parse. */
13540 aarch64_parse_one_override_token (string, strlen (string), tune);
13541 free (string_root);
13542 }
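 /* For example (illustrative only), a string such as

 -moverride=fuse=adrp+add.cmp+branch:sve_width=256

 is split on ':' here into "fuse=adrp+add.cmp+branch" and
 "sve_width=256", and each "name=value" token is then dispatched to the
 matching parser by aarch64_parse_one_override_token. */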
13543
13544
13545 static void
13546 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13547 {
13548 if (accepted_branch_protection_string)
13549 {
13550 opts->x_aarch64_branch_protection_string
13551 = xstrdup (accepted_branch_protection_string);
13552 }
13553
13554 /* PR 70044: We have to be careful about being called multiple times for the
13555 same function. This means all changes should be repeatable. */
13556
13557 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13558 Disable the frame pointer flag so the mid-end will not use a frame
13559 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13560 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13561 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13562 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13563 if (opts->x_flag_omit_frame_pointer == 0)
13564 opts->x_flag_omit_frame_pointer = 2;
13565
13566 /* If not optimizing for size, set the default
13567 alignment to what the target wants. */
13568 if (!opts->x_optimize_size)
13569 {
13570 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13571 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13572 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13573 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13574 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13575 opts->x_str_align_functions = aarch64_tune_params.function_align;
13576 }
13577
13578 /* We default to no pc-relative literal loads. */
13579
13580 aarch64_pcrelative_literal_loads = false;
13581
13582 /* If -mpc-relative-literal-loads is set on the command line, this
13583 implies that the user asked for PC relative literal loads. */
13584 if (opts->x_pcrelative_literal_loads == 1)
13585 aarch64_pcrelative_literal_loads = true;
13586
13587 /* In the tiny memory model it makes no sense to disallow PC relative
13588 literal pool loads. */
13589 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13590 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13591 aarch64_pcrelative_literal_loads = true;
13592
13593 /* When enabling the lower precision Newton series for the square root, also
13594 enable it for the reciprocal square root, since the latter is an
13595 intermediary step for the former. */
13596 if (flag_mlow_precision_sqrt)
13597 flag_mrecip_low_precision_sqrt = true;
13598 }
13599
13600 /* 'Unpack' the internal tuning structs and update the options
13601 in OPTS. The caller must have set up selected_tune and selected_arch
13602 as all the other target-specific codegen decisions are
13603 derived from them. */
13604
13605 void
13606 aarch64_override_options_internal (struct gcc_options *opts)
13607 {
13608 aarch64_tune_flags = selected_tune->flags;
13609 aarch64_tune = selected_tune->sched_core;
13610 /* Make a copy of the tuning parameters attached to the core, which
13611 we may later overwrite. */
13612 aarch64_tune_params = *(selected_tune->tune);
13613 aarch64_architecture_version = selected_arch->architecture_version;
13614
13615 if (opts->x_aarch64_override_tune_string)
13616 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13617 &aarch64_tune_params);
13618
13619 /* This target defaults to strict volatile bitfields. */
13620 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13621 opts->x_flag_strict_volatile_bitfields = 1;
13622
13623 if (aarch64_stack_protector_guard == SSP_GLOBAL
13624 && opts->x_aarch64_stack_protector_guard_offset_str)
13625 {
13626 error ("incompatible options %<-mstack-protector-guard=global%> and "
13627 "%<-mstack-protector-guard-offset=%s%>",
13628 aarch64_stack_protector_guard_offset_str);
13629 }
13630
13631 if (aarch64_stack_protector_guard == SSP_SYSREG
13632 && !(opts->x_aarch64_stack_protector_guard_offset_str
13633 && opts->x_aarch64_stack_protector_guard_reg_str))
13634 {
13635 error ("both %<-mstack-protector-guard-offset%> and "
13636 "%<-mstack-protector-guard-reg%> must be used "
13637 "with %<-mstack-protector-guard=sysreg%>");
13638 }
13639
13640 if (opts->x_aarch64_stack_protector_guard_reg_str)
13641 {
13642 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13643 error ("specify a system register with a small string length.");
13644 }
13645
13646 if (opts->x_aarch64_stack_protector_guard_offset_str)
13647 {
13648 char *end;
13649 const char *str = aarch64_stack_protector_guard_offset_str;
13650 errno = 0;
13651 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13652 if (!*str || *end || errno)
13653 error ("%qs is not a valid offset in %qs", str,
13654 "-mstack-protector-guard-offset=");
13655 aarch64_stack_protector_guard_offset = offs;
13656 }
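 /* Illustrative usage of the checks above (the register name and offset
 are only examples):

 -mstack-protector-guard=sysreg
 -mstack-protector-guard-reg=sp_el0
 -mstack-protector-guard-offset=1032

 where sysreg requires both the register and the offset to be given. */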
13657
13658 initialize_aarch64_code_model (opts);
13659 initialize_aarch64_tls_size (opts);
13660
13661 int queue_depth = 0;
13662 switch (aarch64_tune_params.autoprefetcher_model)
13663 {
13664 case tune_params::AUTOPREFETCHER_OFF:
13665 queue_depth = -1;
13666 break;
13667 case tune_params::AUTOPREFETCHER_WEAK:
13668 queue_depth = 0;
13669 break;
13670 case tune_params::AUTOPREFETCHER_STRONG:
13671 queue_depth = max_insn_queue_index + 1;
13672 break;
13673 default:
13674 gcc_unreachable ();
13675 }
13676
13677 /* We don't mind passing in global_options_set here as we don't use
13678 the *options_set structs anyway. */
13679 SET_OPTION_IF_UNSET (opts, &global_options_set,
13680 param_sched_autopref_queue_depth, queue_depth);
13681
13682 /* Set up parameters to be used in prefetching algorithm. Do not
13683 override the defaults unless we are tuning for a core we have
13684 researched values for. */
13685 if (aarch64_tune_params.prefetch->num_slots > 0)
13686 SET_OPTION_IF_UNSET (opts, &global_options_set,
13687 param_simultaneous_prefetches,
13688 aarch64_tune_params.prefetch->num_slots);
13689 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13690 SET_OPTION_IF_UNSET (opts, &global_options_set,
13691 param_l1_cache_size,
13692 aarch64_tune_params.prefetch->l1_cache_size);
13693 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13694 SET_OPTION_IF_UNSET (opts, &global_options_set,
13695 param_l1_cache_line_size,
13696 aarch64_tune_params.prefetch->l1_cache_line_size);
13697 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13698 SET_OPTION_IF_UNSET (opts, &global_options_set,
13699 param_l2_cache_size,
13700 aarch64_tune_params.prefetch->l2_cache_size);
13701 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13702 SET_OPTION_IF_UNSET (opts, &global_options_set,
13703 param_prefetch_dynamic_strides, 0);
13704 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13705 SET_OPTION_IF_UNSET (opts, &global_options_set,
13706 param_prefetch_minimum_stride,
13707 aarch64_tune_params.prefetch->minimum_stride);
13708
13709 /* Use the alternative scheduling-pressure algorithm by default. */
13710 SET_OPTION_IF_UNSET (opts, &global_options_set,
13711 param_sched_pressure_algorithm,
13712 SCHED_PRESSURE_MODEL);
13713
13714 /* Validate the guard size. */
13715 int guard_size = param_stack_clash_protection_guard_size;
13716
13717 if (guard_size != 12 && guard_size != 16)
13718 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13719 "size. Given value %d (%llu KB) is out of range",
13720 guard_size, (1ULL << guard_size) / 1024ULL);
13721
13722 /* Enforce that interval is the same size as size so the mid-end does the
13723 right thing. */
13724 SET_OPTION_IF_UNSET (opts, &global_options_set,
13725 param_stack_clash_protection_probe_interval,
13726 guard_size);
13727
13728 /* The SET_OPTION_IF_UNSET calls won't update the value if the user has
13729 explicitly set one, which means we need to validate that the probing
13730 interval and guard size are equal. */
13731 int probe_interval
13732 = param_stack_clash_protection_probe_interval;
13733 if (guard_size != probe_interval)
13734 error ("stack clash guard size %<%d%> must be equal to probing interval "
13735 "%<%d%>", guard_size, probe_interval);
13736
13737 /* Enable software prefetching at the specified optimization level for
13738 CPUs that have prefetch. Lower the optimization level threshold by 1
13739 when profiling is enabled. */
13740 if (opts->x_flag_prefetch_loop_arrays < 0
13741 && !opts->x_optimize_size
13742 && aarch64_tune_params.prefetch->default_opt_level >= 0
13743 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13744 opts->x_flag_prefetch_loop_arrays = 1;
13745
13746 if (opts->x_aarch64_arch_string == NULL)
13747 opts->x_aarch64_arch_string = selected_arch->name;
13748 if (opts->x_aarch64_cpu_string == NULL)
13749 opts->x_aarch64_cpu_string = selected_cpu->name;
13750 if (opts->x_aarch64_tune_string == NULL)
13751 opts->x_aarch64_tune_string = selected_tune->name;
13752
13753 aarch64_override_options_after_change_1 (opts);
13754 }
13755
13756 /* Print a hint with a suggestion for a core or architecture name that
13757 most closely resembles what the user passed in STR. ARCH is true if
13758 the user is asking for an architecture name. ARCH is false if the user
13759 is asking for a core name. */
13760
13761 static void
13762 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13763 {
13764 auto_vec<const char *> candidates;
13765 const struct processor *entry = arch ? all_architectures : all_cores;
13766 for (; entry->name != NULL; entry++)
13767 candidates.safe_push (entry->name);
13768
13769 #ifdef HAVE_LOCAL_CPU_DETECT
13770 /* Add also "native" as possible value. */
13771 if (arch)
13772 candidates.safe_push ("native");
13773 #endif
13774
13775 char *s;
13776 const char *hint = candidates_list_and_hint (str, s, candidates);
13777 if (hint)
13778 inform (input_location, "valid arguments are: %s;"
13779 " did you mean %qs?", s, hint);
13780 else
13781 inform (input_location, "valid arguments are: %s", s);
13782
13783 XDELETEVEC (s);
13784 }
13785
13786 /* Print a hint with a suggestion for a core name that most closely resembles
13787 what the user passed in STR. */
13788
13789 inline static void
13790 aarch64_print_hint_for_core (const char *str)
13791 {
13792 aarch64_print_hint_for_core_or_arch (str, false);
13793 }
13794
13795 /* Print a hint with a suggestion for an architecture name that most closely
13796 resembles what the user passed in STR. */
13797
13798 inline static void
13799 aarch64_print_hint_for_arch (const char *str)
13800 {
13801 aarch64_print_hint_for_core_or_arch (str, true);
13802 }
13803
13804
13805 /* Print a hint with a suggestion for an extension name
13806 that most closely resembles what the user passed in STR. */
13807
13808 void
13809 aarch64_print_hint_for_extensions (const std::string &str)
13810 {
13811 auto_vec<const char *> candidates;
13812 aarch64_get_all_extension_candidates (&candidates);
13813 char *s;
13814 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13815 if (hint)
13816 inform (input_location, "valid arguments are: %s;"
13817 " did you mean %qs?", s, hint);
13818 else
13819 inform (input_location, "valid arguments are: %s;", s);
13820
13821 XDELETEVEC (s);
13822 }
13823
13824 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13825 specified in STR and throw errors if appropriate. Put the results, if
13826 they are valid, in RES and ISA_FLAGS. Return whether the option is
13827 valid. */
13828
13829 static bool
13830 aarch64_validate_mcpu (const char *str, const struct processor **res,
13831 uint64_t *isa_flags)
13832 {
13833 std::string invalid_extension;
13834 enum aarch64_parse_opt_result parse_res
13835 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13836
13837 if (parse_res == AARCH64_PARSE_OK)
13838 return true;
13839
13840 switch (parse_res)
13841 {
13842 case AARCH64_PARSE_MISSING_ARG:
13843 error ("missing cpu name in %<-mcpu=%s%>", str);
13844 break;
13845 case AARCH64_PARSE_INVALID_ARG:
13846 error ("unknown value %qs for %<-mcpu%>", str);
13847 aarch64_print_hint_for_core (str);
13848 break;
13849 case AARCH64_PARSE_INVALID_FEATURE:
13850 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13851 invalid_extension.c_str (), str);
13852 aarch64_print_hint_for_extensions (invalid_extension);
13853 break;
13854 default:
13855 gcc_unreachable ();
13856 }
13857
13858 return false;
13859 }
13860
13861 /* Parse CONST_STR for branch protection features specified in
13862 aarch64_branch_protect_types, and set any global variables required. Return
13863 the parsing result and assign LAST_STR to the last processed token from
13864 CONST_STR so that it can be used for error reporting. */
13865
13866 static enum
13867 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
13868 char** last_str)
13869 {
13870 char *str_root = xstrdup (const_str);
13871 char* token_save = NULL;
13872 char *str = strtok_r (str_root, "+", &token_save);
13873 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13874 if (!str)
13875 res = AARCH64_PARSE_MISSING_ARG;
13876 else
13877 {
13878 char *next_str = strtok_r (NULL, "+", &token_save);
13879 /* Reset the branch protection features to their defaults. */
13880 aarch64_handle_no_branch_protection (NULL, NULL);
13881
13882 while (str && res == AARCH64_PARSE_OK)
13883 {
13884 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13885 bool found = false;
13886 /* Search for this type. */
13887 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13888 {
13889 if (strcmp (str, type->name) == 0)
13890 {
13891 found = true;
13892 res = type->handler (str, next_str);
13893 str = next_str;
13894 next_str = strtok_r (NULL, "+", &token_save);
13895 }
13896 else
13897 type++;
13898 }
13899 if (found && res == AARCH64_PARSE_OK)
13900 {
13901 bool found_subtype = true;
13902 /* Loop through each token until we find one that isn't a
13903 subtype. */
13904 while (found_subtype)
13905 {
13906 found_subtype = false;
13907 const aarch64_branch_protect_type *subtype = type->subtypes;
13908 /* Search for the subtype. */
13909 while (str && subtype && subtype->name && !found_subtype
13910 && res == AARCH64_PARSE_OK)
13911 {
13912 if (strcmp (str, subtype->name) == 0)
13913 {
13914 found_subtype = true;
13915 res = subtype->handler (str, next_str);
13916 str = next_str;
13917 next_str = strtok_r (NULL, "+", &token_save);
13918 }
13919 else
13920 subtype++;
13921 }
13922 }
13923 }
13924 else if (!found)
13925 res = AARCH64_PARSE_INVALID_ARG;
13926 }
13927 }
13928 /* Copy the last processed token into the argument to pass it back.
13929 Used by option and attribute validation to print the offending token. */
13930 if (last_str)
13931 {
13932 if (str) strcpy (*last_str, str);
13933 else *last_str = NULL;
13934 }
13935 if (res == AARCH64_PARSE_OK)
13936 {
13937 /* If needed, alloc the accepted string then copy in const_str.
13938 Used by override_option_after_change_1. */
13939 if (!accepted_branch_protection_string)
13940 accepted_branch_protection_string = (char *) xmalloc (
13941 BRANCH_PROTECT_STR_MAX
13942 + 1);
13943 strncpy (accepted_branch_protection_string, const_str,
13944 BRANCH_PROTECT_STR_MAX + 1);
13945 /* Forcibly null-terminate. */
13946 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13947 }
13948 return res;
13949 }
13950
13951 static bool
13952 aarch64_validate_mbranch_protection (const char *const_str)
13953 {
13954 char *str = (char *) xmalloc (strlen (const_str) + 1);
13955 enum aarch64_parse_opt_result res =
13956 aarch64_parse_branch_protection (const_str, &str);
13957 if (res == AARCH64_PARSE_INVALID_ARG)
13958 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13959 else if (res == AARCH64_PARSE_MISSING_ARG)
13960 error ("missing argument for %<-mbranch-protection=%>");
13961 free (str);
13962 return res == AARCH64_PARSE_OK;
13963 }
13964
13965 /* Validate a command-line -march option. Parse the arch and extensions
13966 (if any) specified in STR and throw errors if appropriate. Put the
13967 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13968 option is valid. */
13969
13970 static bool
13971 aarch64_validate_march (const char *str, const struct processor **res,
13972 uint64_t *isa_flags)
13973 {
13974 std::string invalid_extension;
13975 enum aarch64_parse_opt_result parse_res
13976 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13977
13978 if (parse_res == AARCH64_PARSE_OK)
13979 return true;
13980
13981 switch (parse_res)
13982 {
13983 case AARCH64_PARSE_MISSING_ARG:
13984 error ("missing arch name in %<-march=%s%>", str);
13985 break;
13986 case AARCH64_PARSE_INVALID_ARG:
13987 error ("unknown value %qs for %<-march%>", str);
13988 aarch64_print_hint_for_arch (str);
13989 break;
13990 case AARCH64_PARSE_INVALID_FEATURE:
13991 error ("invalid feature modifier %qs in %<-march=%s%>",
13992 invalid_extension.c_str (), str);
13993 aarch64_print_hint_for_extensions (invalid_extension);
13994 break;
13995 default:
13996 gcc_unreachable ();
13997 }
13998
13999 return false;
14000 }
14001
14002 /* Validate a command-line -mtune option. Parse the cpu
14003 specified in STR and throw errors if appropriate. Put the
14004 result, if it is valid, in RES. Return whether the option is
14005 valid. */
14006
14007 static bool
14008 aarch64_validate_mtune (const char *str, const struct processor **res)
14009 {
14010 enum aarch64_parse_opt_result parse_res
14011 = aarch64_parse_tune (str, res);
14012
14013 if (parse_res == AARCH64_PARSE_OK)
14014 return true;
14015
14016 switch (parse_res)
14017 {
14018 case AARCH64_PARSE_MISSING_ARG:
14019 error ("missing cpu name in %<-mtune=%s%>", str);
14020 break;
14021 case AARCH64_PARSE_INVALID_ARG:
14022 error ("unknown value %qs for %<-mtune%>", str);
14023 aarch64_print_hint_for_core (str);
14024 break;
14025 default:
14026 gcc_unreachable ();
14027 }
14028 return false;
14029 }
14030
14031 /* Return the CPU corresponding to the enum CPU.
14032 If it doesn't specify a cpu, return the default. */
14033
14034 static const struct processor *
14035 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14036 {
14037 if (cpu != aarch64_none)
14038 return &all_cores[cpu];
14039
14040 /* The & 0x3f is to extract the bottom 6 bits that encode the
14041 default cpu as selected by the --with-cpu GCC configure option
14042 in config.gcc.
14043 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14044 flags mechanism should be reworked to make it more sane. */
14045 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14046 }
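
/* For example, the configure-time default is packed so that
   TARGET_CPU_DEFAULT & 0x3f indexes all_cores above while
   TARGET_CPU_DEFAULT >> 6 yields the default ISA flags (see its use in
   aarch64_override_options below); the exact packing is inferred here
   from those two uses rather than restated from config.gcc. */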
14047
14048 /* Return the architecture corresponding to the enum ARCH.
14049 If it doesn't specify a valid architecture, return the default. */
14050
14051 static const struct processor *
14052 aarch64_get_arch (enum aarch64_arch arch)
14053 {
14054 if (arch != aarch64_no_arch)
14055 return &all_architectures[arch];
14056
14057 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14058
14059 return &all_architectures[cpu->arch];
14060 }
14061
14062 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
14063
14064 static poly_uint16
14065 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14066 {
14067 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14068 on big-endian targets, so we would need to forbid subregs that convert
14069 from one to the other. By default a reinterpret sequence would then
14070 involve a store to memory in one mode and a load back in the other.
14071 Even if we optimize that sequence using reverse instructions,
14072 it would still be a significant potential overhead.
14073
14074 For now, it seems better to generate length-agnostic code for that
14075 case instead. */
14076 if (value == SVE_SCALABLE
14077 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14078 return poly_uint16 (2, 2);
14079 else
14080 return (int) value / 64;
14081 }
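
/* For example, -msve-vector-bits=256 yields a constant VG of
   256 / 64 == 4 (VG counts 64-bit granules), whereas
   -msve-vector-bits=scalable, and -msve-vector-bits=128 on big-endian
   targets, keep the poly_uint16 (2, 2), i.e. a length of 2 + 2 * n
   granules for some runtime value n. */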
14082
14083 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
14084 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
14085 tuning structs. In particular it must set selected_tune and
14086 aarch64_isa_flags that define the available ISA features and tuning
14087 decisions. It must also set selected_arch as this will be used to
14088 output the .arch asm tags for each function. */
14089
14090 static void
14091 aarch64_override_options (void)
14092 {
14093 uint64_t cpu_isa = 0;
14094 uint64_t arch_isa = 0;
14095 aarch64_isa_flags = 0;
14096
14097 bool valid_cpu = true;
14098 bool valid_tune = true;
14099 bool valid_arch = true;
14100
14101 selected_cpu = NULL;
14102 selected_arch = NULL;
14103 selected_tune = NULL;
14104
14105 if (aarch64_branch_protection_string)
14106 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14107
14108 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14109 If either of -march or -mtune is given, they override their
14110 respective component of -mcpu. */
14111 if (aarch64_cpu_string)
14112 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14113 &cpu_isa);
14114
14115 if (aarch64_arch_string)
14116 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14117 &arch_isa);
14118
14119 if (aarch64_tune_string)
14120 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14121
14122 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14123 SUBTARGET_OVERRIDE_OPTIONS;
14124 #endif
14125
14126 /* If the user did not specify a processor, choose the default
14127 one for them. This will be the CPU set during configuration using
14128 --with-cpu, otherwise it is "generic". */
14129 if (!selected_cpu)
14130 {
14131 if (selected_arch)
14132 {
14133 selected_cpu = &all_cores[selected_arch->ident];
14134 aarch64_isa_flags = arch_isa;
14135 explicit_arch = selected_arch->arch;
14136 }
14137 else
14138 {
14139 /* Get default configure-time CPU. */
14140 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14141 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14142 }
14143
14144 if (selected_tune)
14145 explicit_tune_core = selected_tune->ident;
14146 }
14147 /* If both -mcpu and -march are specified, check that they are architecturally
14148 compatible, warn if they are not, and prefer the -march ISA flags. */
14149 else if (selected_arch)
14150 {
14151 if (selected_arch->arch != selected_cpu->arch)
14152 {
14153 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14154 aarch64_cpu_string,
14155 aarch64_arch_string);
14156 }
14157 aarch64_isa_flags = arch_isa;
14158 explicit_arch = selected_arch->arch;
14159 explicit_tune_core = selected_tune ? selected_tune->ident
14160 : selected_cpu->ident;
14161 }
14162 else
14163 {
14164 /* -mcpu but no -march. */
14165 aarch64_isa_flags = cpu_isa;
14166 explicit_tune_core = selected_tune ? selected_tune->ident
14167 : selected_cpu->ident;
14168 gcc_assert (selected_cpu);
14169 selected_arch = &all_architectures[selected_cpu->arch];
14170 explicit_arch = selected_arch->arch;
14171 }
14172
14173 /* Set the arch as well, as we will need it when outputting
14174 the .arch directive in assembly. */
14175 if (!selected_arch)
14176 {
14177 gcc_assert (selected_cpu);
14178 selected_arch = &all_architectures[selected_cpu->arch];
14179 }
14180
14181 if (!selected_tune)
14182 selected_tune = selected_cpu;
14183
14184 if (aarch64_enable_bti == 2)
14185 {
14186 #ifdef TARGET_ENABLE_BTI
14187 aarch64_enable_bti = 1;
14188 #else
14189 aarch64_enable_bti = 0;
14190 #endif
14191 }
14192
14193 /* Return address signing is currently not supported for ILP32 targets. For
14194 LP64 targets use the configured option in the absence of a command-line
14195 option for -mbranch-protection. */
14196 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14197 {
14198 #ifdef TARGET_ENABLE_PAC_RET
14199 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14200 #else
14201 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14202 #endif
14203 }
14204
14205 #ifndef HAVE_AS_MABI_OPTION
14206 /* The compiler may have been configured with 2.23.* binutils, which does
14207 not have support for ILP32. */
14208 if (TARGET_ILP32)
14209 error ("assembler does not support %<-mabi=ilp32%>");
14210 #endif
14211
14212 /* Convert -msve-vector-bits to a VG count. */
14213 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14214
14215 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14216 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14217
14218 /* Make sure we properly set up the explicit options. */
14219 if ((aarch64_cpu_string && valid_cpu)
14220 || (aarch64_tune_string && valid_tune))
14221 gcc_assert (explicit_tune_core != aarch64_none);
14222
14223 if ((aarch64_cpu_string && valid_cpu)
14224 || (aarch64_arch_string && valid_arch))
14225 gcc_assert (explicit_arch != aarch64_no_arch);
14226
14227 /* The pass to insert speculation tracking runs before
14228 shrink-wrapping and the latter does not know how to update the
14229 tracking status. So disable it in this case. */
14230 if (aarch64_track_speculation)
14231 flag_shrink_wrap = 0;
14232
14233 aarch64_override_options_internal (&global_options);
14234
14235 /* Save these options as the default ones in case we push and pop them later
14236 while processing functions with potential target attributes. */
14237 target_option_default_node = target_option_current_node
14238 = build_target_option_node (&global_options);
14239 }
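
/* For example (core and extension names are illustrative only):
   "-mcpu=cortex-a72 -march=armv8-a+crc" tunes for cortex-a72 but takes
   its ISA flags from -march, warning if the two architectures differ;
   "-mcpu=cortex-a72" alone supplies both the tuning and the ISA flags;
   and "-march=armv8-a" alone selects the architecture's representative
   entry in all_cores for tuning. */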
14240
14241 /* Implement targetm.override_options_after_change. */
14242
14243 static void
14244 aarch64_override_options_after_change (void)
14245 {
14246 aarch64_override_options_after_change_1 (&global_options);
14247 }
14248
14249 static struct machine_function *
14250 aarch64_init_machine_status (void)
14251 {
14252 struct machine_function *machine;
14253 machine = ggc_cleared_alloc<machine_function> ();
14254 return machine;
14255 }
14256
14257 void
14258 aarch64_init_expanders (void)
14259 {
14260 init_machine_status = aarch64_init_machine_status;
14261 }
14262
14263 /* Initialize aarch64_cmodel from the selected code model and the PIC setting. */
14264 static void
14265 initialize_aarch64_code_model (struct gcc_options *opts)
14266 {
14267 if (opts->x_flag_pic)
14268 {
14269 switch (opts->x_aarch64_cmodel_var)
14270 {
14271 case AARCH64_CMODEL_TINY:
14272 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14273 break;
14274 case AARCH64_CMODEL_SMALL:
14275 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14276 aarch64_cmodel = (flag_pic == 2
14277 ? AARCH64_CMODEL_SMALL_PIC
14278 : AARCH64_CMODEL_SMALL_SPIC);
14279 #else
14280 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14281 #endif
14282 break;
14283 case AARCH64_CMODEL_LARGE:
14284 sorry ("code model %qs with %<-f%s%>", "large",
14285 opts->x_flag_pic > 1 ? "PIC" : "pic");
14286 break;
14287 default:
14288 gcc_unreachable ();
14289 }
14290 }
14291 else
14292 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14293 }
14294
14295 /* Implement TARGET_OPTION_SAVE. */
14296
14297 static void
14298 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14299 {
14300 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14301 ptr->x_aarch64_branch_protection_string
14302 = opts->x_aarch64_branch_protection_string;
14303 }
14304
14305 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14306 using the information saved in PTR. */
14307
14308 static void
14309 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14310 {
14311 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14312 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14313 opts->x_explicit_arch = ptr->x_explicit_arch;
14314 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14315 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14316 opts->x_aarch64_branch_protection_string
14317 = ptr->x_aarch64_branch_protection_string;
14318 if (opts->x_aarch64_branch_protection_string)
14319 {
14320 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14321 NULL);
14322 }
14323
14324 aarch64_override_options_internal (opts);
14325 }
14326
14327 /* Implement TARGET_OPTION_PRINT. */
14328
14329 static void
14330 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14331 {
14332 const struct processor *cpu
14333 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14334 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14335 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14336 std::string extension
14337 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14338
14339 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14340 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14341 arch->name, extension.c_str ());
14342 }
14343
14344 static GTY(()) tree aarch64_previous_fndecl;
14345
14346 void
14347 aarch64_reset_previous_fndecl (void)
14348 {
14349 aarch64_previous_fndecl = NULL;
14350 }
14351
14352 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14353 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14354 make sure optab availability predicates are recomputed when necessary. */
14355
14356 void
14357 aarch64_save_restore_target_globals (tree new_tree)
14358 {
14359 if (TREE_TARGET_GLOBALS (new_tree))
14360 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14361 else if (new_tree == target_option_default_node)
14362 restore_target_globals (&default_target_globals);
14363 else
14364 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14365 }
14366
14367 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14368 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14369 of the function, if such exists. This function may be called multiple
14370 times on a single function so use aarch64_previous_fndecl to avoid
14371 setting up identical state. */
14372
14373 static void
14374 aarch64_set_current_function (tree fndecl)
14375 {
14376 if (!fndecl || fndecl == aarch64_previous_fndecl)
14377 return;
14378
14379 tree old_tree = (aarch64_previous_fndecl
14380 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14381 : NULL_TREE);
14382
14383 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14384
14385 /* If current function has no attributes but the previous one did,
14386 use the default node. */
14387 if (!new_tree && old_tree)
14388 new_tree = target_option_default_node;
14389
14390 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14391 the default have been handled by aarch64_save_restore_target_globals from
14392 aarch64_pragma_target_parse. */
14393 if (old_tree == new_tree)
14394 return;
14395
14396 aarch64_previous_fndecl = fndecl;
14397
14398 /* First set the target options. */
14399 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14400
14401 aarch64_save_restore_target_globals (new_tree);
14402 }
14403
14404 /* Enum describing the various ways we can handle attributes.
14405 In many cases we can reuse the generic option handling machinery. */
14406
14407 enum aarch64_attr_opt_type
14408 {
14409 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14410 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14411 aarch64_attr_enum, /* Attribute sets an enum variable. */
14412 aarch64_attr_custom /* Attribute requires a custom handling function. */
14413 };
14414
14415 /* All the information needed to handle a target attribute.
14416 NAME is the name of the attribute.
14417 ATTR_TYPE specifies the type of behavior of the attribute as described
14418 in the definition of enum aarch64_attr_opt_type.
14419 ALLOW_NEG is true if the attribute supports a "no-" form.
14420 HANDLER is the function that takes the attribute string as an argument.
14421 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14422 OPT_NUM is the enum specifying the option that the attribute modifies.
14423 This is needed for attributes that mirror the behavior of a command-line
14424 option, that is, attributes with ATTR_TYPE aarch64_attr_mask,
14425 aarch64_attr_bool or aarch64_attr_enum. */
14426
14427 struct aarch64_attribute_info
14428 {
14429 const char *name;
14430 enum aarch64_attr_opt_type attr_type;
14431 bool allow_neg;
14432 bool (*handler) (const char *);
14433 enum opt_code opt_num;
14434 };
14435
14436 /* Handle the ARCH_STR argument to the arch= target attribute. */
14437
14438 static bool
14439 aarch64_handle_attr_arch (const char *str)
14440 {
14441 const struct processor *tmp_arch = NULL;
14442 std::string invalid_extension;
14443 enum aarch64_parse_opt_result parse_res
14444 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14445
14446 if (parse_res == AARCH64_PARSE_OK)
14447 {
14448 gcc_assert (tmp_arch);
14449 selected_arch = tmp_arch;
14450 explicit_arch = selected_arch->arch;
14451 return true;
14452 }
14453
14454 switch (parse_res)
14455 {
14456 case AARCH64_PARSE_MISSING_ARG:
14457 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14458 break;
14459 case AARCH64_PARSE_INVALID_ARG:
14460 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14461 aarch64_print_hint_for_arch (str);
14462 break;
14463 case AARCH64_PARSE_INVALID_FEATURE:
14464 error ("invalid feature modifier %s of value (\"%s\") in "
14465 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14466 aarch64_print_hint_for_extensions (invalid_extension);
14467 break;
14468 default:
14469 gcc_unreachable ();
14470 }
14471
14472 return false;
14473 }
14474
14475 /* Handle the argument CPU_STR to the cpu= target attribute. */
14476
14477 static bool
14478 aarch64_handle_attr_cpu (const char *str)
14479 {
14480 const struct processor *tmp_cpu = NULL;
14481 std::string invalid_extension;
14482 enum aarch64_parse_opt_result parse_res
14483 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14484
14485 if (parse_res == AARCH64_PARSE_OK)
14486 {
14487 gcc_assert (tmp_cpu);
14488 selected_tune = tmp_cpu;
14489 explicit_tune_core = selected_tune->ident;
14490
14491 selected_arch = &all_architectures[tmp_cpu->arch];
14492 explicit_arch = selected_arch->arch;
14493 return true;
14494 }
14495
14496 switch (parse_res)
14497 {
14498 case AARCH64_PARSE_MISSING_ARG:
14499 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14500 break;
14501 case AARCH64_PARSE_INVALID_ARG:
14502 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14503 aarch64_print_hint_for_core (str);
14504 break;
14505 case AARCH64_PARSE_INVALID_FEATURE:
14506 error ("invalid feature modifier %s of value (\"%s\") in "
14507 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14508 aarch64_print_hint_for_extensions (invalid_extension);
14509 break;
14510 default:
14511 gcc_unreachable ();
14512 }
14513
14514 return false;
14515 }
14516
14517 /* Handle the argument STR to the branch-protection= attribute. */
14518
14519 static bool
14520 aarch64_handle_attr_branch_protection (const char* str)
14521 {
14522 char *err_str = (char *) xmalloc (strlen (str) + 1);
14523 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14524 &err_str);
14525 bool success = false;
14526 switch (res)
14527 {
14528 case AARCH64_PARSE_MISSING_ARG:
14529 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14530 " attribute");
14531 break;
14532 case AARCH64_PARSE_INVALID_ARG:
14533 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14534 "=\")%> pragma or attribute", err_str);
14535 break;
14536 case AARCH64_PARSE_OK:
14537 success = true;
14538 /* Fall through. */
14539 case AARCH64_PARSE_INVALID_FEATURE:
14540 break;
14541 default:
14542 gcc_unreachable ();
14543 }
14544 free (err_str);
14545 return success;
14546 }
14547
14548 /* Handle the argument STR to the tune= target attribute. */
14549
14550 static bool
14551 aarch64_handle_attr_tune (const char *str)
14552 {
14553 const struct processor *tmp_tune = NULL;
14554 enum aarch64_parse_opt_result parse_res
14555 = aarch64_parse_tune (str, &tmp_tune);
14556
14557 if (parse_res == AARCH64_PARSE_OK)
14558 {
14559 gcc_assert (tmp_tune);
14560 selected_tune = tmp_tune;
14561 explicit_tune_core = selected_tune->ident;
14562 return true;
14563 }
14564
14565 switch (parse_res)
14566 {
14567 case AARCH64_PARSE_INVALID_ARG:
14568 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14569 aarch64_print_hint_for_core (str);
14570 break;
14571 default:
14572 gcc_unreachable ();
14573 }
14574
14575 return false;
14576 }
14577
14578 /* Parse an architecture extensions target attribute string specified in STR.
14579 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14580 if successful. Update aarch64_isa_flags to reflect the ISA features
14581 modified. */
14582
14583 static bool
14584 aarch64_handle_attr_isa_flags (char *str)
14585 {
14586 enum aarch64_parse_opt_result parse_res;
14587 uint64_t isa_flags = aarch64_isa_flags;
14588
14589 /* We allow "+nothing" in the beginning to clear out all architectural
14590 features if the user wants to handpick specific features. */
14591 if (strncmp ("+nothing", str, 8) == 0)
14592 {
14593 isa_flags = 0;
14594 str += 8;
14595 }
14596
14597 std::string invalid_extension;
14598 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14599
14600 if (parse_res == AARCH64_PARSE_OK)
14601 {
14602 aarch64_isa_flags = isa_flags;
14603 return true;
14604 }
14605
14606 switch (parse_res)
14607 {
14608 case AARCH64_PARSE_MISSING_ARG:
14609 error ("missing value in %<target()%> pragma or attribute");
14610 break;
14611
14612 case AARCH64_PARSE_INVALID_FEATURE:
14613 error ("invalid feature modifier %s of value (\"%s\") in "
14614 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14615 break;
14616
14617 default:
14618 gcc_unreachable ();
14619 }
14620
14621 return false;
14622 }
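
/* For example, a string of "+nothing+simd" first clears ISA_FLAGS and
   then lets aarch64_parse_extension enable SIMD (plus whatever features
   it implies), while "+nofp" merely removes the FP feature, and the
   features that depend on it, from the current aarch64_isa_flags. */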
14623
14624 /* The target attributes that we support. On top of these we also support just
14625 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14626 handled explicitly in aarch64_process_one_target_attr. */
14627
14628 static const struct aarch64_attribute_info aarch64_attributes[] =
14629 {
14630 { "general-regs-only", aarch64_attr_mask, false, NULL,
14631 OPT_mgeneral_regs_only },
14632 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14633 OPT_mfix_cortex_a53_835769 },
14634 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14635 OPT_mfix_cortex_a53_843419 },
14636 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14637 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14638 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14639 OPT_momit_leaf_frame_pointer },
14640 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14641 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14642 OPT_march_ },
14643 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14644 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14645 OPT_mtune_ },
14646 { "branch-protection", aarch64_attr_custom, false,
14647 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14648 { "sign-return-address", aarch64_attr_enum, false, NULL,
14649 OPT_msign_return_address_ },
14650 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14651 };
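
/* For example, __attribute__ ((target ("no-omit-leaf-frame-pointer")))
   matches the aarch64_attr_bool entry above with its "no-" prefix
   honoured through ALLOW_NEG, "cmodel=small" is routed through the
   aarch64_attr_enum machinery of OPT_mcmodel_, and "arch=armv8.2-a" is
   handed to the aarch64_handle_attr_arch custom handler ("small" and
   "armv8.2-a" are just illustrative values). */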
14652
14653 /* Parse ARG_STR which contains the definition of one target attribute.
14654 Show appropriate errors if any or return true if the attribute is valid. */
14655
14656 static bool
14657 aarch64_process_one_target_attr (char *arg_str)
14658 {
14659 bool invert = false;
14660
14661 size_t len = strlen (arg_str);
14662
14663 if (len == 0)
14664 {
14665 error ("malformed %<target()%> pragma or attribute");
14666 return false;
14667 }
14668
14669 char *str_to_check = (char *) alloca (len + 1);
14670 strcpy (str_to_check, arg_str);
14671
14672 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14673 It is easier to detect and handle it explicitly here rather than going
14674 through the machinery for the rest of the target attributes in this
14675 function. */
14676 if (*str_to_check == '+')
14677 return aarch64_handle_attr_isa_flags (str_to_check);
14678
14679 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14680 {
14681 invert = true;
14682 str_to_check += 3;
14683 }
14684 char *arg = strchr (str_to_check, '=');
14685
14686 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14687 and point ARG to "foo". */
14688 if (arg)
14689 {
14690 *arg = '\0';
14691 arg++;
14692 }
14693 const struct aarch64_attribute_info *p_attr;
14694 bool found = false;
14695 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14696 {
14697 /* If the names don't match up, or the user has given an argument
14698 to an attribute that doesn't accept one, or didn't give an argument
14699 to an attribute that expects one, fail to match. */
14700 if (strcmp (str_to_check, p_attr->name) != 0)
14701 continue;
14702
14703 found = true;
14704 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14705 || p_attr->attr_type == aarch64_attr_enum;
14706
14707 if (attr_need_arg_p ^ (arg != NULL))
14708 {
14709 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14710 return false;
14711 }
14712
14713 /* If the name matches but the attribute does not allow "no-" versions
14714 then we can't match. */
14715 if (invert && !p_attr->allow_neg)
14716 {
14717 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14718 return false;
14719 }
14720
14721 switch (p_attr->attr_type)
14722 {
14723 /* Has a custom handler registered.
14724 For example, cpu=, arch=, tune=. */
14725 case aarch64_attr_custom:
14726 gcc_assert (p_attr->handler);
14727 if (!p_attr->handler (arg))
14728 return false;
14729 break;
14730
14731 /* Either set or unset a boolean option. */
14732 case aarch64_attr_bool:
14733 {
14734 struct cl_decoded_option decoded;
14735
14736 generate_option (p_attr->opt_num, NULL, !invert,
14737 CL_TARGET, &decoded);
14738 aarch64_handle_option (&global_options, &global_options_set,
14739 &decoded, input_location);
14740 break;
14741 }
14742 /* Set or unset a bit in the target_flags. aarch64_handle_option
14743 should know what mask to apply given the option number. */
14744 case aarch64_attr_mask:
14745 {
14746 struct cl_decoded_option decoded;
14747 /* We only need to specify the option number.
14748 aarch64_handle_option will know which mask to apply. */
14749 decoded.opt_index = p_attr->opt_num;
14750 decoded.value = !invert;
14751 aarch64_handle_option (&global_options, &global_options_set,
14752 &decoded, input_location);
14753 break;
14754 }
14755 /* Use the option setting machinery to set an option to an enum. */
14756 case aarch64_attr_enum:
14757 {
14758 gcc_assert (arg);
14759 bool valid;
14760 int value;
14761 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14762 &value, CL_TARGET);
14763 if (valid)
14764 {
14765 set_option (&global_options, NULL, p_attr->opt_num, value,
14766 NULL, DK_UNSPECIFIED, input_location,
14767 global_dc);
14768 }
14769 else
14770 {
14771 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14772 }
14773 break;
14774 }
14775 default:
14776 gcc_unreachable ();
14777 }
14778 }
14779
14780 /* If we reached here we either have found an attribute and validated
14781 it or didn't match any. If we matched an attribute but its arguments
14782 were malformed we will have returned false already. */
14783 return found;
14784 }
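
/* As a worked example of the function above: "tune=cortex-a57" is split
   at the '=' so STR_TO_CHECK becomes "tune" and ARG "cortex-a57", which
   the matching aarch64_attr_custom entry forwards to
   aarch64_handle_attr_tune; "no-strict-align" strips the "no-" prefix,
   sets INVERT and clears the corresponding target_flags bit via
   aarch64_handle_option; and a leading '+', as in "+crc", bypasses the
   table entirely through aarch64_handle_attr_isa_flags. */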
14785
14786 /* Count how many times the character C appears in
14787 NULL-terminated string STR. */
14788
14789 static unsigned int
14790 num_occurences_in_str (char c, char *str)
14791 {
14792 unsigned int res = 0;
14793 while (*str != '\0')
14794 {
14795 if (*str == c)
14796 res++;
14797
14798 str++;
14799 }
14800
14801 return res;
14802 }
14803
14804 /* Parse the tree in ARGS that contains the target attribute information
14805 and update the global target options space. */
14806
14807 bool
14808 aarch64_process_target_attr (tree args)
14809 {
14810 if (TREE_CODE (args) == TREE_LIST)
14811 {
14812 do
14813 {
14814 tree head = TREE_VALUE (args);
14815 if (head)
14816 {
14817 if (!aarch64_process_target_attr (head))
14818 return false;
14819 }
14820 args = TREE_CHAIN (args);
14821 } while (args);
14822
14823 return true;
14824 }
14825
14826 if (TREE_CODE (args) != STRING_CST)
14827 {
14828 error ("attribute %<target%> argument not a string");
14829 return false;
14830 }
14831
14832 size_t len = strlen (TREE_STRING_POINTER (args));
14833 char *str_to_check = (char *) alloca (len + 1);
14834 strcpy (str_to_check, TREE_STRING_POINTER (args));
14835
14836 if (len == 0)
14837 {
14838 error ("malformed %<target()%> pragma or attribute");
14839 return false;
14840 }
14841
14842 /* Used to catch empty strings between commas, e.g.
14843 attribute ((target ("attr1,,attr2"))). */
14844 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14845
14846 /* Handle multiple target attributes separated by ','. */
14847 char *token = strtok_r (str_to_check, ",", &str_to_check);
14848
14849 unsigned int num_attrs = 0;
14850 while (token)
14851 {
14852 num_attrs++;
14853 if (!aarch64_process_one_target_attr (token))
14854 {
14855 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14856 return false;
14857 }
14858
14859 token = strtok_r (NULL, ",", &str_to_check);
14860 }
14861
14862 if (num_attrs != num_commas + 1)
14863 {
14864 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14865 return false;
14866 }
14867
14868 return true;
14869 }
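
/* For example, __attribute__ ((target ("strict-align,tune=cortex-a57")))
   is tokenised on ',' into two attributes that are processed in turn,
   whereas "strict-align,,tune=cortex-a57" produces only two tokens for
   three comma-separated fields, so the NUM_ATTRS != NUM_COMMAS + 1 check
   above rejects it as malformed. */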
14870
14871 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14872 process attribute ((target ("..."))). */
14873
14874 static bool
14875 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14876 {
14877 struct cl_target_option cur_target;
14878 bool ret;
14879 tree old_optimize;
14880 tree new_target, new_optimize;
14881 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14882
14883 /* If what we're processing is the current pragma string then the
14884 target option node is already stored in target_option_current_node
14885 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14886 having to re-parse the string. This is especially useful to keep
14887 arm_neon.h compile times down since that header contains a lot
14888 of intrinsics enclosed in pragmas. */
14889 if (!existing_target && args == current_target_pragma)
14890 {
14891 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14892 return true;
14893 }
14894 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14895
14896 old_optimize = build_optimization_node (&global_options);
14897 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14898
14899 /* If the function changed the optimization levels as well as setting
14900 target options, start with the optimizations specified. */
14901 if (func_optimize && func_optimize != old_optimize)
14902 cl_optimization_restore (&global_options,
14903 TREE_OPTIMIZATION (func_optimize));
14904
14905 /* Save the current target options to restore at the end. */
14906 cl_target_option_save (&cur_target, &global_options);
14907
14908 /* If fndecl already has some target attributes applied to it, unpack
14909 them so that we add this attribute on top of them, rather than
14910 overwriting them. */
14911 if (existing_target)
14912 {
14913 struct cl_target_option *existing_options
14914 = TREE_TARGET_OPTION (existing_target);
14915
14916 if (existing_options)
14917 cl_target_option_restore (&global_options, existing_options);
14918 }
14919 else
14920 cl_target_option_restore (&global_options,
14921 TREE_TARGET_OPTION (target_option_current_node));
14922
14923 ret = aarch64_process_target_attr (args);
14924
14925 /* Set up any additional state. */
14926 if (ret)
14927 {
14928 aarch64_override_options_internal (&global_options);
14929 /* Initialize SIMD builtins if we haven't already.
14930 Set current_target_pragma to NULL for the duration so that
14931 the builtin initialization code doesn't try to tag the functions
14932 being built with the attributes specified by any current pragma, thus
14933 going into an infinite recursion. */
14934 if (TARGET_SIMD)
14935 {
14936 tree saved_current_target_pragma = current_target_pragma;
14937 current_target_pragma = NULL;
14938 aarch64_init_simd_builtins ();
14939 current_target_pragma = saved_current_target_pragma;
14940 }
14941 new_target = build_target_option_node (&global_options);
14942 }
14943 else
14944 new_target = NULL;
14945
14946 new_optimize = build_optimization_node (&global_options);
14947
14948 if (fndecl && ret)
14949 {
14950 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14951
14952 if (old_optimize != new_optimize)
14953 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14954 }
14955
14956 cl_target_option_restore (&global_options, &cur_target);
14957
14958 if (old_optimize != new_optimize)
14959 cl_optimization_restore (&global_options,
14960 TREE_OPTIMIZATION (old_optimize));
14961 return ret;
14962 }
14963
14964 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14965 tri-bool options (yes, no, don't care) and the default value is
14966 DEF, determine whether to reject inlining. */
14967
14968 static bool
14969 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14970 int dont_care, int def)
14971 {
14972 /* If the callee doesn't care, always allow inlining. */
14973 if (callee == dont_care)
14974 return true;
14975
14976 /* If the caller doesn't care, always allow inlining. */
14977 if (caller == dont_care)
14978 return true;
14979
14980 /* Otherwise, allow inlining if either the callee and caller values
14981 agree, or if the callee is using the default value. */
14982 return (callee == caller || callee == def);
14983 }
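
/* For example, for -momit-leaf-frame-pointer one of the calls below
   passes DONT_CARE == 2 and DEF == 1: a callee with no explicit setting
   (2) can always be inlined; an explicit callee value of 1 is accepted
   because it matches the default even if the caller disagrees; but
   caller == 1 with callee == 0 rejects inlining. */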
14984
14985 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14986 to inline CALLEE into CALLER based on target-specific info.
14987 Make sure that the caller and callee have compatible architectural
14988 features. Then go through the other possible target attributes
14989 and see if they can block inlining. Try not to reject always_inline
14990 callees unless they are incompatible architecturally. */
14991
14992 static bool
14993 aarch64_can_inline_p (tree caller, tree callee)
14994 {
14995 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14996 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14997
14998 struct cl_target_option *caller_opts
14999 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15000 : target_option_default_node);
15001
15002 struct cl_target_option *callee_opts
15003 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15004 : target_option_default_node);
15005
15006 /* Callee's ISA flags should be a subset of the caller's. */
15007 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15008 != callee_opts->x_aarch64_isa_flags)
15009 return false;
15010
15011 /* Allow functions built without strict alignment to be inlined into
15012 strictly aligned ones, but not the other way around. */
15013 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15014 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15015 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15016 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15017 return false;
15018
15019 bool always_inline = lookup_attribute ("always_inline",
15020 DECL_ATTRIBUTES (callee));
15021
15022 /* If the architectural features match up and the callee is always_inline
15023 then the other attributes don't matter. */
15024 if (always_inline)
15025 return true;
15026
15027 if (caller_opts->x_aarch64_cmodel_var
15028 != callee_opts->x_aarch64_cmodel_var)
15029 return false;
15030
15031 if (caller_opts->x_aarch64_tls_dialect
15032 != callee_opts->x_aarch64_tls_dialect)
15033 return false;
15034
15035 /* Honour explicit requests to work around errata. */
15036 if (!aarch64_tribools_ok_for_inlining_p (
15037 caller_opts->x_aarch64_fix_a53_err835769,
15038 callee_opts->x_aarch64_fix_a53_err835769,
15039 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15040 return false;
15041
15042 if (!aarch64_tribools_ok_for_inlining_p (
15043 caller_opts->x_aarch64_fix_a53_err843419,
15044 callee_opts->x_aarch64_fix_a53_err843419,
15045 2, TARGET_FIX_ERR_A53_843419))
15046 return false;
15047
15048 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15049 caller and callee and they don't match up, reject inlining. */
15050 if (!aarch64_tribools_ok_for_inlining_p (
15051 caller_opts->x_flag_omit_leaf_frame_pointer,
15052 callee_opts->x_flag_omit_leaf_frame_pointer,
15053 2, 1))
15054 return false;
15055
15056 /* If the callee has specific tuning overrides, respect them. */
15057 if (callee_opts->x_aarch64_override_tune_string != NULL
15058 && caller_opts->x_aarch64_override_tune_string == NULL)
15059 return false;
15060
15061 /* If the user specified tuning override strings for the
15062 caller and callee and they don't match up, reject inlining.
15063 We just do a string compare here, we don't analyze the meaning
15064 of the string, as it would be too costly for little gain. */
15065 if (callee_opts->x_aarch64_override_tune_string
15066 && caller_opts->x_aarch64_override_tune_string
15067 && (strcmp (callee_opts->x_aarch64_override_tune_string,
15068 caller_opts->x_aarch64_override_tune_string) != 0))
15069 return false;
15070
15071 return true;
15072 }
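
/* For example, a callee declared with __attribute__ ((target ("+sve")))
   cannot be inlined into a caller built without SVE, since the callee's
   ISA bits would not be a subset of the caller's, while the reverse
   direction is fine; once that architectural check (and the strict-align
   check) passes, an always_inline callee skips the remaining cmodel,
   tls-dialect, errata and tuning-override comparisons. */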
15073
15074 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
15075 hasn't been initialized already. */
15076
15077 unsigned int
15078 aarch64_tlsdesc_abi_id ()
15079 {
15080 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15081 if (!tlsdesc_abi.initialized_p ())
15082 {
15083 HARD_REG_SET full_reg_clobbers;
15084 CLEAR_HARD_REG_SET (full_reg_clobbers);
15085 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15086 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15087 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15088 SET_HARD_REG_BIT (full_reg_clobbers, regno);
15089 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15090 }
15091 return tlsdesc_abi.id ();
15092 }
15093
15094 /* Return true if SYMBOL_REF X binds locally. */
15095
15096 static bool
15097 aarch64_symbol_binds_local_p (const_rtx x)
15098 {
15099 return (SYMBOL_REF_DECL (x)
15100 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15101 : SYMBOL_REF_LOCAL_P (x));
15102 }
15103
15104 /* Return true if SYMBOL_REF X is thread local */
15105 static bool
15106 aarch64_tls_symbol_p (rtx x)
15107 {
15108 if (! TARGET_HAVE_TLS)
15109 return false;
15110
15111 if (GET_CODE (x) != SYMBOL_REF)
15112 return false;
15113
15114 return SYMBOL_REF_TLS_MODEL (x) != 0;
15115 }
15116
15117 /* Classify a TLS symbol into one of the TLS kinds. */
15118 enum aarch64_symbol_type
15119 aarch64_classify_tls_symbol (rtx x)
15120 {
15121 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15122
15123 switch (tls_kind)
15124 {
15125 case TLS_MODEL_GLOBAL_DYNAMIC:
15126 case TLS_MODEL_LOCAL_DYNAMIC:
15127 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15128
15129 case TLS_MODEL_INITIAL_EXEC:
15130 switch (aarch64_cmodel)
15131 {
15132 case AARCH64_CMODEL_TINY:
15133 case AARCH64_CMODEL_TINY_PIC:
15134 return SYMBOL_TINY_TLSIE;
15135 default:
15136 return SYMBOL_SMALL_TLSIE;
15137 }
15138
15139 case TLS_MODEL_LOCAL_EXEC:
15140 if (aarch64_tls_size == 12)
15141 return SYMBOL_TLSLE12;
15142 else if (aarch64_tls_size == 24)
15143 return SYMBOL_TLSLE24;
15144 else if (aarch64_tls_size == 32)
15145 return SYMBOL_TLSLE32;
15146 else if (aarch64_tls_size == 48)
15147 return SYMBOL_TLSLE48;
15148 else
15149 gcc_unreachable ();
15150
15151 case TLS_MODEL_EMULATED:
15152 case TLS_MODEL_NONE:
15153 return SYMBOL_FORCE_TO_MEM;
15154
15155 default:
15156 gcc_unreachable ();
15157 }
15158 }
15159
15160 /* Return the correct method for accessing X + OFFSET, where X is either
15161 a SYMBOL_REF or LABEL_REF. */
15162
15163 enum aarch64_symbol_type
15164 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15165 {
15166 if (GET_CODE (x) == LABEL_REF)
15167 {
15168 switch (aarch64_cmodel)
15169 {
15170 case AARCH64_CMODEL_LARGE:
15171 return SYMBOL_FORCE_TO_MEM;
15172
15173 case AARCH64_CMODEL_TINY_PIC:
15174 case AARCH64_CMODEL_TINY:
15175 return SYMBOL_TINY_ABSOLUTE;
15176
15177 case AARCH64_CMODEL_SMALL_SPIC:
15178 case AARCH64_CMODEL_SMALL_PIC:
15179 case AARCH64_CMODEL_SMALL:
15180 return SYMBOL_SMALL_ABSOLUTE;
15181
15182 default:
15183 gcc_unreachable ();
15184 }
15185 }
15186
15187 if (GET_CODE (x) == SYMBOL_REF)
15188 {
15189 if (aarch64_tls_symbol_p (x))
15190 return aarch64_classify_tls_symbol (x);
15191
15192 switch (aarch64_cmodel)
15193 {
15194 case AARCH64_CMODEL_TINY:
15195 /* When we retrieve symbol + offset address, we have to make sure
15196 the offset does not cause overflow of the final address. But
15197 we have no way of knowing the address of the symbol at compile time,
15198 so we can't accurately say if the distance between the PC and
15199 symbol + offset is outside the addressable range of +/-1MB in the
15200 TINY code model. So we limit the maximum offset to +/-64KB and
15201 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15202 If offset_within_block_p is true we allow larger offsets.
15203 Furthermore force to memory if the symbol is a weak reference to
15204 something that doesn't resolve to a symbol in this module. */
15205
15206 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15207 return SYMBOL_FORCE_TO_MEM;
15208 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15209 || offset_within_block_p (x, offset)))
15210 return SYMBOL_FORCE_TO_MEM;
15211
15212 return SYMBOL_TINY_ABSOLUTE;
15213
15214 case AARCH64_CMODEL_SMALL:
15215 /* Same reasoning as the tiny code model, but the offset cap here is
15216 1MB, allowing +/-3.9GB for the offset to the symbol. */
15217
15218 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15219 return SYMBOL_FORCE_TO_MEM;
15220 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15221 || offset_within_block_p (x, offset)))
15222 return SYMBOL_FORCE_TO_MEM;
15223
15224 return SYMBOL_SMALL_ABSOLUTE;
15225
15226 case AARCH64_CMODEL_TINY_PIC:
15227 if (!aarch64_symbol_binds_local_p (x))
15228 return SYMBOL_TINY_GOT;
15229 return SYMBOL_TINY_ABSOLUTE;
15230
15231 case AARCH64_CMODEL_SMALL_SPIC:
15232 case AARCH64_CMODEL_SMALL_PIC:
15233 if (!aarch64_symbol_binds_local_p (x))
15234 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15235 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15236 return SYMBOL_SMALL_ABSOLUTE;
15237
15238 case AARCH64_CMODEL_LARGE:
15239 /* This is alright even in PIC code as the constant
15240 pool reference is always PC relative and within
15241 the same translation unit. */
15242 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15243 return SYMBOL_SMALL_ABSOLUTE;
15244 else
15245 return SYMBOL_FORCE_TO_MEM;
15246
15247 default:
15248 gcc_unreachable ();
15249 }
15250 }
15251
15252 /* By default push everything into the constant pool. */
15253 return SYMBOL_FORCE_TO_MEM;
15254 }
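
/* For example, in the tiny code model a reference to SYM + 0x8000 stays
   SYMBOL_TINY_ABSOLUTE because the offset is within the +/-64KB cap,
   whereas SYM + 0x200000 is forced to memory unless offset_within_block_p
   proves the address stays inside SYM's own block; the small code model
   applies the same scheme with a +/-1MB cap. */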
15255
15256 bool
15257 aarch64_constant_address_p (rtx x)
15258 {
15259 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15260 }
15261
15262 bool
15263 aarch64_legitimate_pic_operand_p (rtx x)
15264 {
15265 if (GET_CODE (x) == SYMBOL_REF
15266 || (GET_CODE (x) == CONST
15267 && GET_CODE (XEXP (x, 0)) == PLUS
15268 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15269 return false;
15270
15271 return true;
15272 }
15273
15274 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15275 that should be rematerialized rather than spilled. */
15276
15277 static bool
15278 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15279 {
15280 /* Support CSE and rematerialization of common constants. */
15281 if (CONST_INT_P (x)
15282 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15283 || GET_CODE (x) == CONST_VECTOR)
15284 return true;
15285
15286 /* Do not allow vector struct mode constants for Advanced SIMD.
15287 We could support 0 and -1 easily, but they need support in
15288 aarch64-simd.md. */
15289 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15290 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15291 return false;
15292
15293 /* Only accept variable-length vector constants if they can be
15294 handled directly.
15295
15296 ??? It would be possible to handle rematerialization of other
15297 constants via secondary reloads. */
15298 if (vec_flags & VEC_ANY_SVE)
15299 return aarch64_simd_valid_immediate (x, NULL);
15300
15301 if (GET_CODE (x) == HIGH)
15302 x = XEXP (x, 0);
15303
15304 /* Accept polynomial constants that can be calculated by using the
15305 destination of a move as the sole temporary. Constants that
15306 require a second temporary cannot be rematerialized (they can't be
15307 forced to memory and also aren't legitimate constants). */
15308 poly_int64 offset;
15309 if (poly_int_rtx_p (x, &offset))
15310 return aarch64_offset_temporaries (false, offset) <= 1;
15311
15312 /* If an offset is being added to something else, we need to allow the
15313 base to be moved into the destination register, meaning that there
15314 are no free temporaries for the offset. */
15315 x = strip_offset (x, &offset);
15316 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15317 return false;
15318
15319 /* Do not allow const (plus (anchor_symbol, const_int)). */
15320 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15321 return false;
15322
15323 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15324 so spilling them is better than rematerialization. */
15325 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15326 return true;
15327
15328 /* Label references are always constant. */
15329 if (GET_CODE (x) == LABEL_REF)
15330 return true;
15331
15332 return false;
15333 }
15334
15335 rtx
15336 aarch64_load_tp (rtx target)
15337 {
15338 if (!target
15339 || GET_MODE (target) != Pmode
15340 || !register_operand (target, Pmode))
15341 target = gen_reg_rtx (Pmode);
15342
15343 /* Can return in any reg. */
15344 emit_insn (gen_aarch64_load_tp_hard (target));
15345 return target;
15346 }
15347
15348 /* On AAPCS systems, this is the "struct __va_list". */
15349 static GTY(()) tree va_list_type;
15350
15351 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15352 Return the type to use as __builtin_va_list.
15353
15354 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15355
15356 struct __va_list
15357 {
15358 void *__stack;
15359 void *__gr_top;
15360 void *__vr_top;
15361 int __gr_offs;
15362 int __vr_offs;
15363 }; */
15364
15365 static tree
15366 aarch64_build_builtin_va_list (void)
15367 {
15368 tree va_list_name;
15369 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15370
15371 /* Create the type. */
15372 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15373 /* Give it the required name. */
15374 va_list_name = build_decl (BUILTINS_LOCATION,
15375 TYPE_DECL,
15376 get_identifier ("__va_list"),
15377 va_list_type);
15378 DECL_ARTIFICIAL (va_list_name) = 1;
15379 TYPE_NAME (va_list_type) = va_list_name;
15380 TYPE_STUB_DECL (va_list_type) = va_list_name;
15381
15382 /* Create the fields. */
15383 f_stack = build_decl (BUILTINS_LOCATION,
15384 FIELD_DECL, get_identifier ("__stack"),
15385 ptr_type_node);
15386 f_grtop = build_decl (BUILTINS_LOCATION,
15387 FIELD_DECL, get_identifier ("__gr_top"),
15388 ptr_type_node);
15389 f_vrtop = build_decl (BUILTINS_LOCATION,
15390 FIELD_DECL, get_identifier ("__vr_top"),
15391 ptr_type_node);
15392 f_groff = build_decl (BUILTINS_LOCATION,
15393 FIELD_DECL, get_identifier ("__gr_offs"),
15394 integer_type_node);
15395 f_vroff = build_decl (BUILTINS_LOCATION,
15396 FIELD_DECL, get_identifier ("__vr_offs"),
15397 integer_type_node);
15398
15399 /* Tell tree-stdarg pass about our internal offset fields.
15400 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15401 purposes, to identify whether the code is updating the va_list internal
15402 offset fields in an irregular way. */
15403 va_list_gpr_counter_field = f_groff;
15404 va_list_fpr_counter_field = f_vroff;
15405
15406 DECL_ARTIFICIAL (f_stack) = 1;
15407 DECL_ARTIFICIAL (f_grtop) = 1;
15408 DECL_ARTIFICIAL (f_vrtop) = 1;
15409 DECL_ARTIFICIAL (f_groff) = 1;
15410 DECL_ARTIFICIAL (f_vroff) = 1;
15411
15412 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15413 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15414 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15415 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15416 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15417
15418 TYPE_FIELDS (va_list_type) = f_stack;
15419 DECL_CHAIN (f_stack) = f_grtop;
15420 DECL_CHAIN (f_grtop) = f_vrtop;
15421 DECL_CHAIN (f_vrtop) = f_groff;
15422 DECL_CHAIN (f_groff) = f_vroff;
15423
15424 /* Compute its layout. */
15425 layout_type (va_list_type);
15426
15427 return va_list_type;
15428 }
15429
15430 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15431 static void
15432 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15433 {
15434 const CUMULATIVE_ARGS *cum;
15435 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15436 tree stack, grtop, vrtop, groff, vroff;
15437 tree t;
15438 int gr_save_area_size = cfun->va_list_gpr_size;
15439 int vr_save_area_size = cfun->va_list_fpr_size;
15440 int vr_offset;
15441
15442 cum = &crtl->args.info;
15443 if (cfun->va_list_gpr_size)
15444 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15445 cfun->va_list_gpr_size);
15446 if (cfun->va_list_fpr_size)
15447 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15448 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15449
15450 if (!TARGET_FLOAT)
15451 {
15452 gcc_assert (cum->aapcs_nvrn == 0);
15453 vr_save_area_size = 0;
15454 }
15455
15456 f_stack = TYPE_FIELDS (va_list_type_node);
15457 f_grtop = DECL_CHAIN (f_stack);
15458 f_vrtop = DECL_CHAIN (f_grtop);
15459 f_groff = DECL_CHAIN (f_vrtop);
15460 f_vroff = DECL_CHAIN (f_groff);
15461
15462 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15463 NULL_TREE);
15464 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15465 NULL_TREE);
15466 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15467 NULL_TREE);
15468 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15469 NULL_TREE);
15470 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15471 NULL_TREE);
15472
15473 /* Emit code to initialize STACK, which points to the next varargs stack
15474 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15475 by named arguments. STACK is 8-byte aligned. */
15476 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15477 if (cum->aapcs_stack_size > 0)
15478 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15479 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15480 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15481
15482 /* Emit code to initialize GRTOP, the top of the GR save area.
15483 virtual_incoming_args_rtx should have been 16 byte aligned. */
15484 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15485 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15486 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15487
15488 /* Emit code to initialize VRTOP, the top of the VR save area.
15489 This address is gr_save_area_size bytes below GRTOP, rounded
15490 down to the next 16-byte boundary. */
15491 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15492 vr_offset = ROUND_UP (gr_save_area_size,
15493 STACK_BOUNDARY / BITS_PER_UNIT);
15494
15495 if (vr_offset)
15496 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15497 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15498 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15499
15500 /* Emit code to initialize GROFF, the offset from GRTOP of the
15501 next GPR argument. */
15502 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15503 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15504 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15505
15506 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15507 of the next VR argument. */
15508 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15509 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15510 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15511 }
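
/* As a worked example (assuming the usual eight X0-X7 and eight V0-V7
   argument registers and a Q-sized UNITS_PER_VREG of 16): for
   "void f (int n, ...)" with the full save areas live, one GPR is taken
   by the named argument, so gr_save_area_size is 7 * 8 == 56 and
   vr_save_area_size is 8 * 16 == 128; __gr_offs is therefore initialised
   to -56, __vr_offs to -128, __gr_top to the incoming-argument address
   and __vr_top to that address minus ROUND_UP (56, 16) == 64. */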
15512
15513 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15514
15515 static tree
15516 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15517 gimple_seq *post_p ATTRIBUTE_UNUSED)
15518 {
15519 tree addr;
15520 bool indirect_p;
15521 bool is_ha; /* is HFA or HVA. */
15522 bool dw_align; /* double-word align. */
15523 machine_mode ag_mode = VOIDmode;
15524 int nregs;
15525 machine_mode mode;
15526
15527 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15528 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15529 HOST_WIDE_INT size, rsize, adjust, align;
15530 tree t, u, cond1, cond2;
15531
15532 indirect_p = pass_va_arg_by_reference (type);
15533 if (indirect_p)
15534 type = build_pointer_type (type);
15535
15536 mode = TYPE_MODE (type);
15537
15538 f_stack = TYPE_FIELDS (va_list_type_node);
15539 f_grtop = DECL_CHAIN (f_stack);
15540 f_vrtop = DECL_CHAIN (f_grtop);
15541 f_groff = DECL_CHAIN (f_vrtop);
15542 f_vroff = DECL_CHAIN (f_groff);
15543
15544 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15545 f_stack, NULL_TREE);
15546 size = int_size_in_bytes (type);
15547
15548 bool abi_break;
15549 align
15550 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15551
15552 dw_align = false;
15553 adjust = 0;
15554 if (aarch64_vfp_is_call_or_return_candidate (mode,
15555 type,
15556 &ag_mode,
15557 &nregs,
15558 &is_ha))
15559 {
15560 /* No frontends can create types with variable-sized modes, so we
15561 shouldn't be asked to pass or return them. */
15562 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15563
15564 /* TYPE passed in fp/simd registers. */
15565 if (!TARGET_FLOAT)
15566 aarch64_err_no_fpadvsimd (mode);
15567
15568 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15569 unshare_expr (valist), f_vrtop, NULL_TREE);
15570 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15571 unshare_expr (valist), f_vroff, NULL_TREE);
15572
15573 rsize = nregs * UNITS_PER_VREG;
15574
15575 if (is_ha)
15576 {
15577 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15578 adjust = UNITS_PER_VREG - ag_size;
15579 }
15580 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15581 && size < UNITS_PER_VREG)
15582 {
15583 adjust = UNITS_PER_VREG - size;
15584 }
15585 }
15586 else
15587 {
15588 /* TYPE passed in general registers. */
15589 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15590 unshare_expr (valist), f_grtop, NULL_TREE);
15591 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15592 unshare_expr (valist), f_groff, NULL_TREE);
15593 rsize = ROUND_UP (size, UNITS_PER_WORD);
15594 nregs = rsize / UNITS_PER_WORD;
15595
15596 if (align > 8)
15597 {
15598 if (abi_break && warn_psabi)
15599 inform (input_location, "parameter passing for argument of type "
15600 "%qT changed in GCC 9.1", type);
15601 dw_align = true;
15602 }
15603
15604 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15605 && size < UNITS_PER_WORD)
15606 {
15607 adjust = UNITS_PER_WORD - size;
15608 }
15609 }
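
/* For example (illustrative types only): a homogeneous aggregate of two
   floats is an FP/SIMD candidate, so NREGS == 2 and RSIZE == 2 *
   UNITS_PER_VREG with the __vr fields used below, while a
   "struct { long a, b, c; }" takes the general-register path with RSIZE
   rounded up to 24 and NREGS == 3; DW_ALIGN is only set on that path,
   for arguments such as a 16-byte-aligned __int128 whose alignment
   exceeds 8 bytes. */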
15610
15611 /* Get a local temporary for the field value. */
15612 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15613
15614 /* Emit code to branch if off >= 0. */
15615 t = build2 (GE_EXPR, boolean_type_node, off,
15616 build_int_cst (TREE_TYPE (off), 0));
15617 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15618
15619 if (dw_align)
15620 {
15621 /* Emit: offs = (offs + 15) & -16. */
15622 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15623 build_int_cst (TREE_TYPE (off), 15));
15624 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15625 build_int_cst (TREE_TYPE (off), -16));
15626 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15627 }
15628 else
15629 roundup = NULL;
15630
15631 /* Update ap.__[g|v]r_offs */
15632 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15633 build_int_cst (TREE_TYPE (off), rsize));
15634 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15635
15636 /* String up. */
15637 if (roundup)
15638 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15639
15640 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15641 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15642 build_int_cst (TREE_TYPE (f_off), 0));
15643 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15644
15645 /* String up: make sure the assignment happens before the use. */
15646 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15647 COND_EXPR_ELSE (cond1) = t;
15648
15649 /* Prepare the trees handling the argument that is passed on the stack;
15650 the top-level node will be stored in ON_STACK. */
15651 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15652 if (align > 8)
15653 {
15654 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15655 t = fold_build_pointer_plus_hwi (arg, 15);
15656 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15657 build_int_cst (TREE_TYPE (t), -16));
15658 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15659 }
15660 else
15661 roundup = NULL;
15662 /* Advance ap.__stack */
15663 t = fold_build_pointer_plus_hwi (arg, size + 7);
15664 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15665 build_int_cst (TREE_TYPE (t), -8));
15666 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15667 /* String up roundup and advance. */
15668 if (roundup)
15669 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15670 /* String up with arg */
15671 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15672 /* Big-endianness related address adjustment. */
15673 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15674 && size < UNITS_PER_WORD)
15675 {
15676 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15677 size_int (UNITS_PER_WORD - size));
15678 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15679 }
15680
15681 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15682 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15683
15684 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15685 t = off;
15686 if (adjust)
15687 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15688 build_int_cst (TREE_TYPE (off), adjust));
15689
15690 t = fold_convert (sizetype, t);
15691 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15692
15693 if (is_ha)
15694 {
15695 /* type ha; // treat as "struct {ftype field[n];}"
15696 ... [computing offs]
15697 for (i = 0; i < nregs; ++i, offs += 16)
15698 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15699 return ha; */
15700 int i;
15701 tree tmp_ha, field_t, field_ptr_t;
15702
15703 /* Declare a local variable. */
15704 tmp_ha = create_tmp_var_raw (type, "ha");
15705 gimple_add_tmp_var (tmp_ha);
15706
15707 /* Establish the base type. */
15708 switch (ag_mode)
15709 {
15710 case E_SFmode:
15711 field_t = float_type_node;
15712 field_ptr_t = float_ptr_type_node;
15713 break;
15714 case E_DFmode:
15715 field_t = double_type_node;
15716 field_ptr_t = double_ptr_type_node;
15717 break;
15718 case E_TFmode:
15719 field_t = long_double_type_node;
15720 field_ptr_t = long_double_ptr_type_node;
15721 break;
15722 case E_HFmode:
15723 field_t = aarch64_fp16_type_node;
15724 field_ptr_t = aarch64_fp16_ptr_type_node;
15725 break;
15726 case E_BFmode:
15727 field_t = aarch64_bf16_type_node;
15728 field_ptr_t = aarch64_bf16_ptr_type_node;
15729 break;
15730 case E_V2SImode:
15731 case E_V4SImode:
15732 {
15733 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15734 field_t = build_vector_type_for_mode (innertype, ag_mode);
15735 field_ptr_t = build_pointer_type (field_t);
15736 }
15737 break;
15738 default:
15739 gcc_assert (0);
15740 }
15741
15742 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15743 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15744 addr = t;
15745 t = fold_convert (field_ptr_t, addr);
15746 t = build2 (MODIFY_EXPR, field_t,
15747 build1 (INDIRECT_REF, field_t, tmp_ha),
15748 build1 (INDIRECT_REF, field_t, t));
15749
15750 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15751 for (i = 1; i < nregs; ++i)
15752 {
15753 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15754 u = fold_convert (field_ptr_t, addr);
15755 u = build2 (MODIFY_EXPR, field_t,
15756 build2 (MEM_REF, field_t, tmp_ha,
15757 build_int_cst (field_ptr_t,
15758 (i *
15759 int_size_in_bytes (field_t)))),
15760 build1 (INDIRECT_REF, field_t, u));
15761 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15762 }
15763
15764 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15765 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15766 }
15767
15768 COND_EXPR_ELSE (cond2) = t;
15769 addr = fold_convert (build_pointer_type (type), cond1);
15770 addr = build_va_arg_indirect_ref (addr);
15771
15772 if (indirect_p)
15773 addr = build_va_arg_indirect_ref (addr);
15774
15775 return addr;
15776 }
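/* Illustrative sketch (editorial addition, not part of the original source):
   a loose picture of the trees built above for a hypothetical homogeneous
   aggregate read via va_arg:

     struct pt { double x, y; };
     struct pt p = va_arg (ap, struct pt);

   Conceptually this expands to something like:

     if (ap.__vr_offs >= 0)
       p = *(struct pt *) ap.__stack;           // already spilled to stack
     else
       {
         off = ap.__vr_offs;
         ap.__vr_offs = off + 2 * 16;           // two V registers consumed
         if (ap.__vr_offs > 0)
           p = *(struct pt *) ap.__stack;       // ran out of V registers
         else
           {
             ha.field[0] = *(double *) (ap.__vr_top + off);
             ha.field[1] = *(double *) (ap.__vr_top + off + 16);
             p = ha;
           }
       }

   plus the __stack advance and the big-endian/alignment adjustments coded
   above.  */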
15777
15778 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15779
15780 static void
15781 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15782 const function_arg_info &arg,
15783 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15784 {
15785 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15786 CUMULATIVE_ARGS local_cum;
15787 int gr_saved = cfun->va_list_gpr_size;
15788 int vr_saved = cfun->va_list_fpr_size;
15789
15790 /* The caller has advanced CUM up to, but not beyond, the last named
15791 argument. Advance a local copy of CUM past the last "real" named
15792 argument, to find out how many registers are left over. */
15793 local_cum = *cum;
15794 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
15795
15796 /* Find out how many registers we need to save,
15797 honoring the tree-stdarg analysis results. */
15798 if (cfun->va_list_gpr_size)
15799 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15800 cfun->va_list_gpr_size / UNITS_PER_WORD);
15801 if (cfun->va_list_fpr_size)
15802 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15803 cfun->va_list_fpr_size / UNITS_PER_VREG);
15804
15805 if (!TARGET_FLOAT)
15806 {
15807 gcc_assert (local_cum.aapcs_nvrn == 0);
15808 vr_saved = 0;
15809 }
15810
15811 if (!no_rtl)
15812 {
15813 if (gr_saved > 0)
15814 {
15815 rtx ptr, mem;
15816
15817 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15818 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15819 - gr_saved * UNITS_PER_WORD);
15820 mem = gen_frame_mem (BLKmode, ptr);
15821 set_mem_alias_set (mem, get_varargs_alias_set ());
15822
15823 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15824 mem, gr_saved);
15825 }
15826 if (vr_saved > 0)
15827 {
15828 /* We can't use move_block_from_reg, because it will use
15829 the wrong mode, storing D regs only. */
15830 machine_mode mode = TImode;
15831 int off, i, vr_start;
15832
15833 /* Set OFF to the offset from virtual_incoming_args_rtx of
15834 the first vector register. The VR save area lies below
15835 the GR one, and is aligned to 16 bytes. */
15836 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15837 STACK_BOUNDARY / BITS_PER_UNIT);
15838 off -= vr_saved * UNITS_PER_VREG;
15839
15840 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15841 for (i = 0; i < vr_saved; ++i)
15842 {
15843 rtx ptr, mem;
15844
15845 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15846 mem = gen_frame_mem (mode, ptr);
15847 set_mem_alias_set (mem, get_varargs_alias_set ());
15848 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15849 off += UNITS_PER_VREG;
15850 }
15851 }
15852 }
15853
15854 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15855 any complication of having crtl->args.pretend_args_size changed. */
15856 cfun->machine->frame.saved_varargs_size
15857 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15858 STACK_BOUNDARY / BITS_PER_UNIT)
15859 + vr_saved * UNITS_PER_VREG);
15860 }
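/* Illustrative worked example (editorial addition, not part of the original
   source), assuming the usual NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16.  For

     int f (int a, ...);

   the named argument consumes x0 only, so gr_saved == 7 and vr_saved == 8
   (unless the stdarg pass has shrunk cfun->va_list_{gpr,fpr}_size), giving

     saved_varargs_size = ROUND_UP (7 * 8, 16) + 8 * 16 = 64 + 128 = 192

   bytes: x1-x7 are dumped just below the incoming-argument pointer, with
   q0-q7 in a 16-byte-aligned block below them.  */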
15861
15862 static void
15863 aarch64_conditional_register_usage (void)
15864 {
15865 int i;
15866 if (!TARGET_FLOAT)
15867 {
15868 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15869 {
15870 fixed_regs[i] = 1;
15871 call_used_regs[i] = 1;
15872 }
15873 }
15874 if (!TARGET_SVE)
15875 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15876 {
15877 fixed_regs[i] = 1;
15878 call_used_regs[i] = 1;
15879 }
15880
15881 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15882 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15883 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15884
15885 /* When tracking speculation, we need a couple of call-clobbered registers
15886 to track the speculation state. It would be nice to just use
15887 IP0 and IP1, but currently there are numerous places that just
15888 assume these registers are free for other uses (e.g. pointer
15889 authentication). */
15890 if (aarch64_track_speculation)
15891 {
15892 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15893 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15894 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15895 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15896 }
15897 }
15898
15899 /* Walk down the type tree of TYPE counting consecutive base elements.
15900 If *MODEP is VOIDmode, then set it to the first valid floating point
15901 type. If a non-floating point type is found, or if a floating point
15902 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15903 otherwise return the count in the sub-tree. */
15904 static int
15905 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15906 {
15907 machine_mode mode;
15908 HOST_WIDE_INT size;
15909
15910 /* SVE types (and types containing SVE types) must be handled
15911 before calling this function. */
15912 gcc_assert (!aarch64_sve::builtin_type_p (type));
15913
15914 switch (TREE_CODE (type))
15915 {
15916 case REAL_TYPE:
15917 mode = TYPE_MODE (type);
15918 if (mode != DFmode && mode != SFmode
15919 && mode != TFmode && mode != HFmode)
15920 return -1;
15921
15922 if (*modep == VOIDmode)
15923 *modep = mode;
15924
15925 if (*modep == mode)
15926 return 1;
15927
15928 break;
15929
15930 case COMPLEX_TYPE:
15931 mode = TYPE_MODE (TREE_TYPE (type));
15932 if (mode != DFmode && mode != SFmode
15933 && mode != TFmode && mode != HFmode)
15934 return -1;
15935
15936 if (*modep == VOIDmode)
15937 *modep = mode;
15938
15939 if (*modep == mode)
15940 return 2;
15941
15942 break;
15943
15944 case VECTOR_TYPE:
15945 /* Use V2SImode and V4SImode as representatives of all 64-bit
15946 and 128-bit vector types. */
15947 size = int_size_in_bytes (type);
15948 switch (size)
15949 {
15950 case 8:
15951 mode = V2SImode;
15952 break;
15953 case 16:
15954 mode = V4SImode;
15955 break;
15956 default:
15957 return -1;
15958 }
15959
15960 if (*modep == VOIDmode)
15961 *modep = mode;
15962
15963 /* Vector modes are considered to be opaque: two vectors are
15964 equivalent for the purposes of being homogeneous aggregates
15965 if they are the same size. */
15966 if (*modep == mode)
15967 return 1;
15968
15969 break;
15970
15971 case ARRAY_TYPE:
15972 {
15973 int count;
15974 tree index = TYPE_DOMAIN (type);
15975
15976 /* Can't handle incomplete types nor sizes that are not
15977 fixed. */
15978 if (!COMPLETE_TYPE_P (type)
15979 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15980 return -1;
15981
15982 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15983 if (count == -1
15984 || !index
15985 || !TYPE_MAX_VALUE (index)
15986 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15987 || !TYPE_MIN_VALUE (index)
15988 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15989 || count < 0)
15990 return -1;
15991
15992 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15993 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15994
15995 /* There must be no padding. */
15996 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15997 count * GET_MODE_BITSIZE (*modep)))
15998 return -1;
15999
16000 return count;
16001 }
16002
16003 case RECORD_TYPE:
16004 {
16005 int count = 0;
16006 int sub_count;
16007 tree field;
16008
16009 /* Can't handle incomplete types nor sizes that are not
16010 fixed. */
16011 if (!COMPLETE_TYPE_P (type)
16012 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16013 return -1;
16014
16015 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16016 {
16017 if (TREE_CODE (field) != FIELD_DECL)
16018 continue;
16019
16020 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16021 if (sub_count < 0)
16022 return -1;
16023 count += sub_count;
16024 }
16025
16026 /* There must be no padding. */
16027 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16028 count * GET_MODE_BITSIZE (*modep)))
16029 return -1;
16030
16031 return count;
16032 }
16033
16034 case UNION_TYPE:
16035 case QUAL_UNION_TYPE:
16036 {
16037 /* These aren't very interesting except in a degenerate case. */
16038 int count = 0;
16039 int sub_count;
16040 tree field;
16041
16042 /* Can't handle incomplete types nor sizes that are not
16043 fixed. */
16044 if (!COMPLETE_TYPE_P (type)
16045 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16046 return -1;
16047
16048 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16049 {
16050 if (TREE_CODE (field) != FIELD_DECL)
16051 continue;
16052
16053 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16054 if (sub_count < 0)
16055 return -1;
16056 count = count > sub_count ? count : sub_count;
16057 }
16058
16059 /* There must be no padding. */
16060 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16061 count * GET_MODE_BITSIZE (*modep)))
16062 return -1;
16063
16064 return count;
16065 }
16066
16067 default:
16068 break;
16069 }
16070
16071 return -1;
16072 }
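/* Illustrative example (editorial addition, not part of the original
   source): for a hypothetical user type

     struct rgb { float r, g, b; };

   the walk above sets *MODEP to SFmode at the first field and returns 3,
   whereas

     struct mix { float r; double d; };

   returns -1 because the second field's mode no longer matches *MODEP.
   A _Complex float member counts as 2, and an array of four doubles as 4.  */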
16073
16074 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16075 type as described in AAPCS64 \S 4.1.2.
16076
16077 See the comment above aarch64_composite_type_p for the notes on MODE. */
16078
16079 static bool
16080 aarch64_short_vector_p (const_tree type,
16081 machine_mode mode)
16082 {
16083 poly_int64 size = -1;
16084
16085 if (type && aarch64_sve::builtin_type_p (type))
16086 return false;
16087
16088 if (type && TREE_CODE (type) == VECTOR_TYPE)
16089 size = int_size_in_bytes (type);
16090 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16091 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16092 size = GET_MODE_SIZE (mode);
16093
16094 return known_eq (size, 8) || known_eq (size, 16);
16095 }
16096
16097 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16098 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16099 array types. The C99 floating-point complex types are also considered
16100 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16101 types, which are GCC extensions and out of the scope of AAPCS64, are
16102 treated as composite types here as well.
16103
16104 Note that MODE itself is not sufficient in determining whether a type
16105 is such a composite type or not. This is because
16106 stor-layout.c:compute_record_mode may have already changed the MODE
16107 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16108 structure with only one field may have its MODE set to the mode of the
16109 field. Also an integer mode whose size matches the size of the
16110 RECORD_TYPE type may be used to substitute the original mode
16111 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16112 solely relied on. */
16113
16114 static bool
16115 aarch64_composite_type_p (const_tree type,
16116 machine_mode mode)
16117 {
16118 if (aarch64_short_vector_p (type, mode))
16119 return false;
16120
16121 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16122 return true;
16123
16124 if (mode == BLKmode
16125 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16126 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16127 return true;
16128
16129 return false;
16130 }
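/* Illustrative examples (editorial addition, not part of the original
   source), assuming the standard GNU vector_size extension:

     typedef int v4si __attribute__ ((vector_size (16)));

     struct s { int a; };        // composite (aggregate)
     _Complex double cd;         // composite (AAPCS64 7.1.1)
     v4si v;                     // not composite: 128-bit short vector

   The short-vector check runs first, which is why v4si is excluded even
   though it occupies 16 bytes.  */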
16131
16132 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16133 shall be passed or returned in simd/fp register(s) (providing these
16134 parameter passing registers are available).
16135
16136 Upon successful return, *COUNT returns the number of needed registers,
16137 *BASE_MODE returns the mode of the individual register and when IS_HAF
16138 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16139 floating-point aggregate or a homogeneous short-vector aggregate. */
16140
16141 static bool
16142 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16143 const_tree type,
16144 machine_mode *base_mode,
16145 int *count,
16146 bool *is_ha)
16147 {
16148 if (is_ha != NULL) *is_ha = false;
16149
16150 if (type && aarch64_sve::builtin_type_p (type))
16151 return false;
16152
16153 machine_mode new_mode = VOIDmode;
16154 bool composite_p = aarch64_composite_type_p (type, mode);
16155
16156 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16157 || aarch64_short_vector_p (type, mode))
16158 {
16159 *count = 1;
16160 new_mode = mode;
16161 }
16162 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16163 {
16164 if (is_ha != NULL) *is_ha = true;
16165 *count = 2;
16166 new_mode = GET_MODE_INNER (mode);
16167 }
16168 else if (type && composite_p)
16169 {
16170 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16171
16172 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16173 {
16174 if (is_ha != NULL) *is_ha = true;
16175 *count = ag_count;
16176 }
16177 else
16178 return false;
16179 }
16180 else
16181 return false;
16182
16183 *base_mode = new_mode;
16184 return true;
16185 }
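/* Illustrative classifications (editorial addition, not part of the
   original source); the aggregate types are hypothetical:

     double d;                          // count 1, base mode DFmode
     _Complex float cf;                 // count 2, base SFmode, *IS_HA = true
     struct q { double a, b, c, d; };   // count 4, base DFmode, *IS_HA = true
     struct r { float f[5]; };          // rejected: exceeds HA_MAX_NUM_FLDS
     struct t { float f; int i; };      // rejected: mixed element types

   Rejected arguments fall back to the general-register/stack rules.  */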
16186
16187 /* Implement TARGET_STRUCT_VALUE_RTX. */
16188
16189 static rtx
16190 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16191 int incoming ATTRIBUTE_UNUSED)
16192 {
16193 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16194 }
16195
16196 /* Implements target hook vector_mode_supported_p. */
16197 static bool
16198 aarch64_vector_mode_supported_p (machine_mode mode)
16199 {
16200 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16201 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16202 }
16203
16204 /* Return the full-width SVE vector mode for element mode MODE, if one
16205 exists. */
16206 opt_machine_mode
16207 aarch64_full_sve_mode (scalar_mode mode)
16208 {
16209 switch (mode)
16210 {
16211 case E_DFmode:
16212 return VNx2DFmode;
16213 case E_SFmode:
16214 return VNx4SFmode;
16215 case E_HFmode:
16216 return VNx8HFmode;
16217 case E_BFmode:
16218 return VNx8BFmode;
16219 case E_DImode:
16220 return VNx2DImode;
16221 case E_SImode:
16222 return VNx4SImode;
16223 case E_HImode:
16224 return VNx8HImode;
16225 case E_QImode:
16226 return VNx16QImode;
16227 default:
16228 return opt_machine_mode ();
16229 }
16230 }
16231
16232 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16233 if it exists. */
16234 opt_machine_mode
16235 aarch64_vq_mode (scalar_mode mode)
16236 {
16237 switch (mode)
16238 {
16239 case E_DFmode:
16240 return V2DFmode;
16241 case E_SFmode:
16242 return V4SFmode;
16243 case E_HFmode:
16244 return V8HFmode;
16245 case E_BFmode:
16246 return V8BFmode;
16247 case E_SImode:
16248 return V4SImode;
16249 case E_HImode:
16250 return V8HImode;
16251 case E_QImode:
16252 return V16QImode;
16253 case E_DImode:
16254 return V2DImode;
16255 default:
16256 return opt_machine_mode ();
16257 }
16258 }
16259
16260 /* Return appropriate SIMD container
16261 for MODE within a vector of WIDTH bits. */
16262 static machine_mode
16263 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16264 {
16265 if (TARGET_SVE
16266 && maybe_ne (width, 128)
16267 && known_eq (width, BITS_PER_SVE_VECTOR))
16268 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16269
16270 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16271 if (TARGET_SIMD)
16272 {
16273 if (known_eq (width, 128))
16274 return aarch64_vq_mode (mode).else_mode (word_mode);
16275 else
16276 switch (mode)
16277 {
16278 case E_SFmode:
16279 return V2SFmode;
16280 case E_HFmode:
16281 return V4HFmode;
16282 case E_BFmode:
16283 return V4BFmode;
16284 case E_SImode:
16285 return V2SImode;
16286 case E_HImode:
16287 return V4HImode;
16288 case E_QImode:
16289 return V8QImode;
16290 default:
16291 break;
16292 }
16293 }
16294 return word_mode;
16295 }
16296
16297 /* Return the preferred SIMD mode for MODE: a full SVE vector when SVE is available, otherwise the 128-bit Advanced SIMD container. */
16298 static machine_mode
16299 aarch64_preferred_simd_mode (scalar_mode mode)
16300 {
16301 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16302 return aarch64_simd_container_mode (mode, bits);
16303 }
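/* Illustrative mapping (editorial addition, not part of the original
   source): with SVE enabled the vectorizer is offered the scalable
   container, otherwise the 128-bit Advanced SIMD one, e.g.

     SFmode  -> VNx4SFmode with SVE, else V4SFmode
     QImode  -> VNx16QImode with SVE, else V16QImode

   except that an SVE vector length fixed at exactly 128 bits falls back to
   the Advanced SIMD mode; element modes with no matching container fall
   back to word_mode.  */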
16304
16305 /* Return a list of possible vector sizes for the vectorizer
16306 to iterate over. */
16307 static unsigned int
16308 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16309 {
16310 static const machine_mode sve_modes[] = {
16311 /* Try using full vectors for all element types. */
16312 VNx16QImode,
16313
16314 /* Try using 16-bit containers for 8-bit elements and full vectors
16315 for wider elements. */
16316 VNx8QImode,
16317
16318 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16319 full vectors for wider elements. */
16320 VNx4QImode,
16321
16322 /* Try using 64-bit containers for all element types. */
16323 VNx2QImode
16324 };
16325
16326 static const machine_mode advsimd_modes[] = {
16327 /* Try using 128-bit vectors for all element types. */
16328 V16QImode,
16329
16330 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16331 for wider elements. */
16332 V8QImode,
16333
16334 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16335 for wider elements.
16336
16337 TODO: We could support a limited form of V4QImode too, so that
16338 we use 32-bit vectors for 8-bit elements. */
16339 V4HImode,
16340
16341 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16342 for 64-bit elements.
16343
16344 TODO: We could similarly support limited forms of V2QImode and V2HImode
16345 for this case. */
16346 V2SImode
16347 };
16348
16349 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16350 This is because:
16351
16352 - If we can't use N-byte Advanced SIMD vectors then the placement
16353 doesn't matter; we'll just continue as though the Advanced SIMD
16354 entry didn't exist.
16355
16356 - If an SVE main loop with N bytes ends up being cheaper than an
16357 Advanced SIMD main loop with N bytes then by default we'll replace
16358 the Advanced SIMD version with the SVE one.
16359
16360 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16361 than an SVE main loop with N bytes then by default we'll try to
16362 use the SVE loop to vectorize the epilogue instead. */
16363 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16364 unsigned int advsimd_i = 0;
16365 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16366 {
16367 if (sve_i < ARRAY_SIZE (sve_modes)
16368 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16369 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16370 modes->safe_push (sve_modes[sve_i++]);
16371 else
16372 modes->safe_push (advsimd_modes[advsimd_i++]);
16373 }
16374 while (sve_i < ARRAY_SIZE (sve_modes))
16375 modes->safe_push (sve_modes[sve_i++]);
16376
16377 unsigned int flags = 0;
16378 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16379 can compare SVE against Advanced SIMD and so that we can compare
16380 multiple SVE vectorization approaches against each other. There's
16381 not really any point doing this for Advanced SIMD only, since the
16382 first mode that works should always be the best. */
16383 if (TARGET_SVE && aarch64_sve_compare_costs)
16384 flags |= VECT_COMPARE_COSTS;
16385 return flags;
16386 }
16387
16388 /* Implement TARGET_MANGLE_TYPE. */
16389
16390 static const char *
16391 aarch64_mangle_type (const_tree type)
16392 {
16393 /* The AArch64 ABI documents say that "__va_list" has to be
16394 mangled as if it is in the "std" namespace. */
16395 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16396 return "St9__va_list";
16397
16398 /* Half-precision floating point types. */
16399 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16400 {
16401 if (TYPE_MODE (type) == BFmode)
16402 return "u6__bf16";
16403 else
16404 return "Dh";
16405 }
16406
16407 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16408 builtin types. */
16409 if (TYPE_NAME (type) != NULL)
16410 {
16411 const char *res;
16412 if ((res = aarch64_general_mangle_builtin_type (type))
16413 || (res = aarch64_sve::mangle_builtin_type (type)))
16414 return res;
16415 }
16416
16417 /* Use the default mangling. */
16418 return NULL;
16419 }
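/* Illustrative manglings (editorial addition, not part of the original
   source), following from the strings returned above plus the standard
   Itanium scheme:

     void f (__builtin_va_list);   // mangled _Z1fSt9__va_list
     void g (__fp16);              // mangled _Z1gDh
     void h (__bf16);              // mangled _Z1hu6__bf16

   "St9__va_list" is the encoding the ABI reserves for std::__va_list.  */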
16420
16421 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16422
16423 static bool
16424 aarch64_verify_type_context (location_t loc, type_context_kind context,
16425 const_tree type, bool silent_p)
16426 {
16427 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16428 }
16429
16430 /* Find the first rtx_insn before insn that will generate an assembly
16431 instruction. */
16432
16433 static rtx_insn *
16434 aarch64_prev_real_insn (rtx_insn *insn)
16435 {
16436 if (!insn)
16437 return NULL;
16438
16439 do
16440 {
16441 insn = prev_real_insn (insn);
16442 }
16443 while (insn && recog_memoized (insn) < 0);
16444
16445 return insn;
16446 }
16447
16448 static bool
16449 is_madd_op (enum attr_type t1)
16450 {
16451 unsigned int i;
16452 /* A number of these may be AArch32 only. */
16453 enum attr_type mlatypes[] = {
16454 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16455 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16456 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16457 };
16458
16459 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16460 {
16461 if (t1 == mlatypes[i])
16462 return true;
16463 }
16464
16465 return false;
16466 }
16467
16468 /* Check if there is a register dependency between a load and the insn
16469 for which we hold recog_data. */
16470
16471 static bool
16472 dep_between_memop_and_curr (rtx memop)
16473 {
16474 rtx load_reg;
16475 int opno;
16476
16477 gcc_assert (GET_CODE (memop) == SET);
16478
16479 if (!REG_P (SET_DEST (memop)))
16480 return false;
16481
16482 load_reg = SET_DEST (memop);
16483 for (opno = 1; opno < recog_data.n_operands; opno++)
16484 {
16485 rtx operand = recog_data.operand[opno];
16486 if (REG_P (operand)
16487 && reg_overlap_mentioned_p (load_reg, operand))
16488 return true;
16489
16490 }
16491 return false;
16492 }
16493
16494
16495 /* When working around the Cortex-A53 erratum 835769,
16496 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16497 instruction and has a preceding memory instruction such that a NOP
16498 should be inserted between them. */
16499
16500 bool
16501 aarch64_madd_needs_nop (rtx_insn* insn)
16502 {
16503 enum attr_type attr_type;
16504 rtx_insn *prev;
16505 rtx body;
16506
16507 if (!TARGET_FIX_ERR_A53_835769)
16508 return false;
16509
16510 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16511 return false;
16512
16513 attr_type = get_attr_type (insn);
16514 if (!is_madd_op (attr_type))
16515 return false;
16516
16517 prev = aarch64_prev_real_insn (insn);
16518 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16519 Restore recog state to INSN to avoid state corruption. */
16520 extract_constrain_insn_cached (insn);
16521
16522 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16523 return false;
16524
16525 body = single_set (prev);
16526
16527 /* If the previous insn is a memory op and there is no dependency between
16528 it and the DImode madd, emit a NOP between them. If body is NULL then we
16529 have a complex memory operation, probably a load/store pair.
16530 Be conservative for now and emit a NOP. */
16531 if (GET_MODE (recog_data.operand[0]) == DImode
16532 && (!body || !dep_between_memop_and_curr (body)))
16533 return true;
16534
16535 return false;
16536
16537 }
16538
16539
16540 /* Implement FINAL_PRESCAN_INSN. */
16541
16542 void
16543 aarch64_final_prescan_insn (rtx_insn *insn)
16544 {
16545 if (aarch64_madd_needs_nop (insn))
16546 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16547 }
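/* Illustrative example (editorial addition, not part of the original
   source): on a core affected by erratum 835769 the workaround turns, for
   instance,

     ldr  x2, [x0]
     madd x3, x4, x5, x6

   into

     ldr  x2, [x0]
     nop // between mem op and mult-accumulate
     madd x3, x4, x5, x6

   whenever the multiply-accumulate is 64-bit and no register dependency
   links it to the preceding memory access.  */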
16548
16549
16550 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16551 instruction. */
16552
16553 bool
16554 aarch64_sve_index_immediate_p (rtx base_or_step)
16555 {
16556 return (CONST_INT_P (base_or_step)
16557 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16558 }
16559
16560 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
16561 when applied to mode MODE. Negate X first if NEGATE_P is true. */
16562
16563 bool
16564 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
16565 {
16566 rtx elt = unwrap_const_vec_duplicate (x);
16567 if (!CONST_INT_P (elt))
16568 return false;
16569
16570 HOST_WIDE_INT val = INTVAL (elt);
16571 if (negate_p)
16572 val = -val;
16573 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
16574
16575 if (val & 0xff)
16576 return IN_RANGE (val, 0, 0xff);
16577 return IN_RANGE (val, 0, 0xff00);
16578 }
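/* Illustrative values (editorial addition, not part of the original
   source): after masking to the element width, the accepted immediates are
   an unsigned byte, optionally shifted left by 8:

     ADD z0.s, z0.s, #255      // 0xff  -> valid
     ADD z0.s, z0.s, #256      // 0x100 -> valid (byte 1, LSL #8)
     ADD z0.s, z0.s, #257      // 0x101 -> rejected here

   i.e. 0-255, or a multiple of 256 up to 65280.  */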
16579
16580 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16581 instructions when applied to mode MODE. Negate X first if NEGATE_P
16582 is true. */
16583
16584 bool
16585 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
16586 {
16587 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
16588 return false;
16589
16590 /* After the optional negation, the immediate must be nonnegative.
16591 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16592 instead of SQADD Zn.B, Zn.B, #129. */
16593 rtx elt = unwrap_const_vec_duplicate (x);
16594 return negate_p == (INTVAL (elt) < 0);
16595 }
16596
16597 /* Return true if X is a valid immediate operand for an SVE logical
16598 instruction such as AND. */
16599
16600 bool
16601 aarch64_sve_bitmask_immediate_p (rtx x)
16602 {
16603 rtx elt;
16604
16605 return (const_vec_duplicate_p (x, &elt)
16606 && CONST_INT_P (elt)
16607 && aarch64_bitmask_imm (INTVAL (elt),
16608 GET_MODE_INNER (GET_MODE (x))));
16609 }
16610
16611 /* Return true if X is a valid immediate for the SVE DUP and CPY
16612 instructions. */
16613
16614 bool
16615 aarch64_sve_dup_immediate_p (rtx x)
16616 {
16617 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16618 if (!CONST_INT_P (x))
16619 return false;
16620
16621 HOST_WIDE_INT val = INTVAL (x);
16622 if (val & 0xff)
16623 return IN_RANGE (val, -0x80, 0x7f);
16624 return IN_RANGE (val, -0x8000, 0x7f00);
16625 }
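/* Illustrative values (editorial addition, not part of the original
   source): DUP/CPY take a signed byte, optionally shifted left by 8, so
   after taking the bit representation the accepted values are:

     #127     -> valid (byte)
     #-129    -> rejected (low byte nonzero, outside -128..127)
     #-256    -> valid (byte -1, LSL #8)
     #32512   -> valid (0x7f00)

   i.e. -128..127, or a multiple of 256 in -32768..32512.  */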
16626
16627 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16628 SIGNED_P says whether the operand is signed rather than unsigned. */
16629
16630 bool
16631 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16632 {
16633 x = unwrap_const_vec_duplicate (x);
16634 return (CONST_INT_P (x)
16635 && (signed_p
16636 ? IN_RANGE (INTVAL (x), -16, 15)
16637 : IN_RANGE (INTVAL (x), 0, 127)));
16638 }
16639
16640 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16641 instruction. Negate X first if NEGATE_P is true. */
16642
16643 bool
16644 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16645 {
16646 rtx elt;
16647 REAL_VALUE_TYPE r;
16648
16649 if (!const_vec_duplicate_p (x, &elt)
16650 || GET_CODE (elt) != CONST_DOUBLE)
16651 return false;
16652
16653 r = *CONST_DOUBLE_REAL_VALUE (elt);
16654
16655 if (negate_p)
16656 r = real_value_negate (&r);
16657
16658 if (real_equal (&r, &dconst1))
16659 return true;
16660 if (real_equal (&r, &dconsthalf))
16661 return true;
16662 return false;
16663 }
16664
16665 /* Return true if X is a valid immediate operand for an SVE FMUL
16666 instruction. */
16667
16668 bool
16669 aarch64_sve_float_mul_immediate_p (rtx x)
16670 {
16671 rtx elt;
16672
16673 return (const_vec_duplicate_p (x, &elt)
16674 && GET_CODE (elt) == CONST_DOUBLE
16675 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16676 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16677 }
16678
16679 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16680 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16681 is nonnull, use it to describe valid immediates. */
16682 static bool
16683 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16684 simd_immediate_info *info,
16685 enum simd_immediate_check which,
16686 simd_immediate_info::insn_type insn)
16687 {
16688 /* Try a 4-byte immediate with LSL. */
16689 for (unsigned int shift = 0; shift < 32; shift += 8)
16690 if ((val32 & (0xff << shift)) == val32)
16691 {
16692 if (info)
16693 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16694 simd_immediate_info::LSL, shift);
16695 return true;
16696 }
16697
16698 /* Try a 2-byte immediate with LSL. */
16699 unsigned int imm16 = val32 & 0xffff;
16700 if (imm16 == (val32 >> 16))
16701 for (unsigned int shift = 0; shift < 16; shift += 8)
16702 if ((imm16 & (0xff << shift)) == imm16)
16703 {
16704 if (info)
16705 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16706 simd_immediate_info::LSL, shift);
16707 return true;
16708 }
16709
16710 /* Try a 4-byte immediate with MSL, except for cases that MVN
16711 can handle. */
16712 if (which == AARCH64_CHECK_MOV)
16713 for (unsigned int shift = 8; shift < 24; shift += 8)
16714 {
16715 unsigned int low = (1 << shift) - 1;
16716 if (((val32 & (0xff << shift)) | low) == val32)
16717 {
16718 if (info)
16719 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16720 simd_immediate_info::MSL, shift);
16721 return true;
16722 }
16723 }
16724
16725 return false;
16726 }
16727
16728 /* Return true if replicating VAL64 is a valid immediate for the
16729 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16730 use it to describe valid immediates. */
16731 static bool
16732 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16733 simd_immediate_info *info,
16734 enum simd_immediate_check which)
16735 {
16736 unsigned int val32 = val64 & 0xffffffff;
16737 unsigned int val16 = val64 & 0xffff;
16738 unsigned int val8 = val64 & 0xff;
16739
16740 if (val32 == (val64 >> 32))
16741 {
16742 if ((which & AARCH64_CHECK_ORR) != 0
16743 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16744 simd_immediate_info::MOV))
16745 return true;
16746
16747 if ((which & AARCH64_CHECK_BIC) != 0
16748 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16749 simd_immediate_info::MVN))
16750 return true;
16751
16752 /* Try using a replicated byte. */
16753 if (which == AARCH64_CHECK_MOV
16754 && val16 == (val32 >> 16)
16755 && val8 == (val16 >> 8))
16756 {
16757 if (info)
16758 *info = simd_immediate_info (QImode, val8);
16759 return true;
16760 }
16761 }
16762
16763 /* Try using a bit-to-bytemask. */
16764 if (which == AARCH64_CHECK_MOV)
16765 {
16766 unsigned int i;
16767 for (i = 0; i < 64; i += 8)
16768 {
16769 unsigned char byte = (val64 >> i) & 0xff;
16770 if (byte != 0 && byte != 0xff)
16771 break;
16772 }
16773 if (i == 64)
16774 {
16775 if (info)
16776 *info = simd_immediate_info (DImode, val64);
16777 return true;
16778 }
16779 }
16780 return false;
16781 }
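/* Illustrative 64-bit replication patterns (editorial addition, not part of
   the original source), classified for a plain MOV-style query
   (AARCH64_CHECK_MOV):

     0x2a2a2a2a2a2a2a2a   // replicated byte            -> 8-bit MOVI
     0x004d004d004d004d   // replicated halfword 0x4d   -> 16-bit MOVI, LSL #0
     0x0000ffff00000000   // every byte 0x00 or 0xff    -> 64-bit bytemask MOVI
     0x0123456789abcdef   // no usable repetition       -> rejected

   The ORR/BIC-only queries accept only the shifted-byte forms handled by
   aarch64_advsimd_valid_immediate_hs.  */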
16782
16783 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16784 instruction. If INFO is nonnull, use it to describe valid immediates. */
16785
16786 static bool
16787 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16788 simd_immediate_info *info)
16789 {
16790 scalar_int_mode mode = DImode;
16791 unsigned int val32 = val64 & 0xffffffff;
16792 if (val32 == (val64 >> 32))
16793 {
16794 mode = SImode;
16795 unsigned int val16 = val32 & 0xffff;
16796 if (val16 == (val32 >> 16))
16797 {
16798 mode = HImode;
16799 unsigned int val8 = val16 & 0xff;
16800 if (val8 == (val16 >> 8))
16801 mode = QImode;
16802 }
16803 }
16804 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16805 if (IN_RANGE (val, -0x80, 0x7f))
16806 {
16807 /* DUP with no shift. */
16808 if (info)
16809 *info = simd_immediate_info (mode, val);
16810 return true;
16811 }
16812 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16813 {
16814 /* DUP with LSL #8. */
16815 if (info)
16816 *info = simd_immediate_info (mode, val);
16817 return true;
16818 }
16819 if (aarch64_bitmask_imm (val64, mode))
16820 {
16821 /* DUPM. */
16822 if (info)
16823 *info = simd_immediate_info (mode, val);
16824 return true;
16825 }
16826 return false;
16827 }
16828
16829 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16830
16831 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16832
16833 where PATTERN is the svpattern as a CONST_INT and where ZERO
16834 is a zero constant of the required PTRUE mode (which can have
16835 fewer elements than X's mode, if zero bits are significant).
16836
16837 If so, and if INFO is nonnull, describe the immediate in INFO. */
16838 bool
16839 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16840 {
16841 if (GET_CODE (x) != CONST)
16842 return false;
16843
16844 x = XEXP (x, 0);
16845 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16846 return false;
16847
16848 if (info)
16849 {
16850 aarch64_svpattern pattern
16851 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16852 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16853 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16854 *info = simd_immediate_info (int_mode, pattern);
16855 }
16856 return true;
16857 }
16858
16859 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16860 it to describe valid immediates. */
16861
16862 static bool
16863 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16864 {
16865 if (aarch64_sve_ptrue_svpattern_p (x, info))
16866 return true;
16867
16868 if (x == CONST0_RTX (GET_MODE (x)))
16869 {
16870 if (info)
16871 *info = simd_immediate_info (DImode, 0);
16872 return true;
16873 }
16874
16875 /* Analyze the value as a VNx16BImode. This should be relatively
16876 efficient, since rtx_vector_builder has enough built-in capacity
16877 to store all VLA predicate constants without needing the heap. */
16878 rtx_vector_builder builder;
16879 if (!aarch64_get_sve_pred_bits (builder, x))
16880 return false;
16881
16882 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16883 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16884 {
16885 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16886 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16887 if (pattern != AARCH64_NUM_SVPATTERNS)
16888 {
16889 if (info)
16890 {
16891 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16892 *info = simd_immediate_info (int_mode, pattern);
16893 }
16894 return true;
16895 }
16896 }
16897 return false;
16898 }
16899
16900 /* Return true if OP is a valid SIMD immediate for the operation
16901 described by WHICH. If INFO is nonnull, use it to describe valid
16902 immediates. */
16903 bool
16904 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16905 enum simd_immediate_check which)
16906 {
16907 machine_mode mode = GET_MODE (op);
16908 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16909 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16910 return false;
16911
16912 if (vec_flags & VEC_SVE_PRED)
16913 return aarch64_sve_pred_valid_immediate (op, info);
16914
16915 scalar_mode elt_mode = GET_MODE_INNER (mode);
16916 rtx base, step;
16917 unsigned int n_elts;
16918 if (GET_CODE (op) == CONST_VECTOR
16919 && CONST_VECTOR_DUPLICATE_P (op))
16920 n_elts = CONST_VECTOR_NPATTERNS (op);
16921 else if ((vec_flags & VEC_SVE_DATA)
16922 && const_vec_series_p (op, &base, &step))
16923 {
16924 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16925 if (!aarch64_sve_index_immediate_p (base)
16926 || !aarch64_sve_index_immediate_p (step))
16927 return false;
16928
16929 if (info)
16930 {
16931 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16932 should yield two integer values per 128-bit block, meaning
16933 that we need to treat it in the same way as V2DI and then
16934 ignore the upper 32 bits of each element. */
16935 elt_mode = aarch64_sve_container_int_mode (mode);
16936 *info = simd_immediate_info (elt_mode, base, step);
16937 }
16938 return true;
16939 }
16940 else if (GET_CODE (op) == CONST_VECTOR
16941 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16942 /* N_ELTS set above. */;
16943 else
16944 return false;
16945
16946 scalar_float_mode elt_float_mode;
16947 if (n_elts == 1
16948 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16949 {
16950 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16951 if (aarch64_float_const_zero_rtx_p (elt)
16952 || aarch64_float_const_representable_p (elt))
16953 {
16954 if (info)
16955 *info = simd_immediate_info (elt_float_mode, elt);
16956 return true;
16957 }
16958 }
16959
16960 /* If all elements in an SVE vector have the same value, we have a free
16961 choice between using the element mode and using the container mode.
16962 Using the element mode means that unused parts of the vector are
16963 duplicates of the used elements, while using the container mode means
16964 that the unused parts are an extension of the used elements. Using the
16965 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16966 for its container mode VNx4SI while 0x00000101 isn't.
16967
16968 If not all elements in an SVE vector have the same value, we need the
16969 transition from one element to the next to occur at container boundaries.
16970 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16971 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16972 scalar_int_mode elt_int_mode;
16973 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
16974 elt_int_mode = aarch64_sve_container_int_mode (mode);
16975 else
16976 elt_int_mode = int_mode_for_mode (elt_mode).require ();
16977
16978 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
16979 if (elt_size > 8)
16980 return false;
16981
16982 /* Expand the vector constant out into a byte vector, with the least
16983 significant byte of the register first. */
16984 auto_vec<unsigned char, 16> bytes;
16985 bytes.reserve (n_elts * elt_size);
16986 for (unsigned int i = 0; i < n_elts; i++)
16987 {
16988 /* The vector is provided in gcc endian-neutral fashion.
16989 For aarch64_be Advanced SIMD, it must be laid out in the vector
16990 register in reverse order. */
16991 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16992 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16993
16994 if (elt_mode != elt_int_mode)
16995 elt = gen_lowpart (elt_int_mode, elt);
16996
16997 if (!CONST_INT_P (elt))
16998 return false;
16999
17000 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17001 for (unsigned int byte = 0; byte < elt_size; byte++)
17002 {
17003 bytes.quick_push (elt_val & 0xff);
17004 elt_val >>= BITS_PER_UNIT;
17005 }
17006 }
17007
17008 /* The immediate must repeat every eight bytes. */
17009 unsigned int nbytes = bytes.length ();
17010 for (unsigned i = 8; i < nbytes; ++i)
17011 if (bytes[i] != bytes[i - 8])
17012 return false;
17013
17014 /* Get the repeating 8-byte value as an integer. No endian correction
17015 is needed here because bytes is already in lsb-first order. */
17016 unsigned HOST_WIDE_INT val64 = 0;
17017 for (unsigned int i = 0; i < 8; i++)
17018 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17019 << (i * BITS_PER_UNIT));
17020
17021 if (vec_flags & VEC_SVE_DATA)
17022 return aarch64_sve_valid_immediate (val64, info);
17023 else
17024 return aarch64_advsimd_valid_immediate (val64, info, which);
17025 }
17026
17027 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17028 has a step in the range of INDEX. Return the index expression if so,
17029 otherwise return null. */
17030 rtx
17031 aarch64_check_zero_based_sve_index_immediate (rtx x)
17032 {
17033 rtx base, step;
17034 if (const_vec_series_p (x, &base, &step)
17035 && base == const0_rtx
17036 && aarch64_sve_index_immediate_p (step))
17037 return step;
17038 return NULL_RTX;
17039 }
17040
17041 /* Check that immediate shift constants are within range. */
17042 bool
17043 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17044 {
17045 x = unwrap_const_vec_duplicate (x);
17046 if (!CONST_INT_P (x))
17047 return false;
17048 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17049 if (left)
17050 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17051 else
17052 return IN_RANGE (INTVAL (x), 1, bit_width);
17053 }
17054
17055 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17056 operation of width WIDTH at bit position POS. */
17057
17058 rtx
17059 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17060 {
17061 gcc_assert (CONST_INT_P (width));
17062 gcc_assert (CONST_INT_P (pos));
17063
17064 unsigned HOST_WIDE_INT mask
17065 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17066 return GEN_INT (mask << UINTVAL (pos));
17067 }
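/* Illustrative example (editorial addition, not part of the original
   source): for a zero extract of width 8 at bit position 16 this returns

     ((1 << 8) - 1) << 16 == 0x00ff0000

   i.e. the mask an AND-based pattern needs to isolate that bit-field.  */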
17068
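/* Return true if X is a suitable "mov" source operand for mode MODE: a HIGH
   of a valid symbolic reference, a CONST_INT, a valid vector immediate
   (with predicate constants canonicalized to VNx16BI before RA), a constant
   DImode symbol address, an SVE CNT[BHWD]-style immediate, or a
   tiny-absolute symbolic expression.  */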
17069 bool
17070 aarch64_mov_operand_p (rtx x, machine_mode mode)
17071 {
17072 if (GET_CODE (x) == HIGH
17073 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17074 return true;
17075
17076 if (CONST_INT_P (x))
17077 return true;
17078
17079 if (VECTOR_MODE_P (GET_MODE (x)))
17080 {
17081 /* Require predicate constants to be VNx16BI before RA, so that we
17082 force everything to have a canonical form. */
17083 if (!lra_in_progress
17084 && !reload_completed
17085 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17086 && GET_MODE (x) != VNx16BImode)
17087 return false;
17088
17089 return aarch64_simd_valid_immediate (x, NULL);
17090 }
17091
17092 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17093 return true;
17094
17095 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17096 return true;
17097
17098 return aarch64_classify_symbolic_expression (x)
17099 == SYMBOL_TINY_ABSOLUTE;
17100 }
17101
17102 /* Return a const_int vector of VAL. */
17103 rtx
17104 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
17105 {
17106 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
17107 return gen_const_vec_duplicate (mode, c);
17108 }
17109
17110 /* Check OP is a legal scalar immediate for the MOVI instruction. */
17111
17112 bool
17113 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
17114 {
17115 machine_mode vmode;
17116
17117 vmode = aarch64_simd_container_mode (mode, 64);
17118 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
17119 return aarch64_simd_valid_immediate (op_v, NULL);
17120 }
17121
17122 /* Construct and return a PARALLEL RTX vector with elements numbering the
17123 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17124 the vector - from the perspective of the architecture. This does not
17125 line up with GCC's perspective on lane numbers, so we end up with
17126 different masks depending on our target endian-ness. The diagram
17127 below may help. We must draw the distinction when building masks
17128 which select one half of the vector. An instruction selecting
17129 architectural low-lanes for a big-endian target, must be described using
17130 a mask selecting GCC high-lanes.
17131
17132 Big-Endian Little-Endian
17133
17134 GCC 0 1 2 3 3 2 1 0
17135 | x | x | x | x | | x | x | x | x |
17136 Architecture 3 2 1 0 3 2 1 0
17137
17138 Low Mask: { 2, 3 } { 0, 1 }
17139 High Mask: { 0, 1 } { 2, 3 }
17140
17141 MODE is the mode of the vector and NUNITS is the number of units in it. */
17142
17143 rtx
17144 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17145 {
17146 rtvec v = rtvec_alloc (nunits / 2);
17147 int high_base = nunits / 2;
17148 int low_base = 0;
17149 int base;
17150 rtx t1;
17151 int i;
17152
17153 if (BYTES_BIG_ENDIAN)
17154 base = high ? low_base : high_base;
17155 else
17156 base = high ? high_base : low_base;
17157
17158 for (i = 0; i < nunits / 2; i++)
17159 RTVEC_ELT (v, i) = GEN_INT (base + i);
17160
17161 t1 = gen_rtx_PARALLEL (mode, v);
17162 return t1;
17163 }
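/* Illustrative example (editorial addition, not part of the original
   source): for V4SImode (NUNITS == 4) the returned PARALLELs are

     little-endian:  low half -> (parallel [0 1]),  high half -> (parallel [2 3])
     big-endian:     low half -> (parallel [2 3]),  high half -> (parallel [0 1])

   matching the Low/High mask table in the comment above.  */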
17164
17165 /* Check OP for validity as a PARALLEL RTX vector with elements
17166 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17167 from the perspective of the architecture. See the diagram above
17168 aarch64_simd_vect_par_cnst_half for more details. */
17169
17170 bool
17171 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17172 bool high)
17173 {
17174 int nelts;
17175 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17176 return false;
17177
17178 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17179 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17180 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17181 int i = 0;
17182
17183 if (count_op != count_ideal)
17184 return false;
17185
17186 for (i = 0; i < count_ideal; i++)
17187 {
17188 rtx elt_op = XVECEXP (op, 0, i);
17189 rtx elt_ideal = XVECEXP (ideal, 0, i);
17190
17191 if (!CONST_INT_P (elt_op)
17192 || INTVAL (elt_ideal) != INTVAL (elt_op))
17193 return false;
17194 }
17195 return true;
17196 }
17197
17198 /* Return a PARALLEL containing NELTS elements, with element I equal
17199 to BASE + I * STEP. */
17200
17201 rtx
17202 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17203 {
17204 rtvec vec = rtvec_alloc (nelts);
17205 for (unsigned int i = 0; i < nelts; ++i)
17206 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17207 return gen_rtx_PARALLEL (VOIDmode, vec);
17208 }
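/* Illustrative example (editorial addition, not part of the original
   source):

     aarch64_gen_stepped_int_parallel (4, 1, 2)

   builds (parallel [(const_int 1) (const_int 3) (const_int 5) (const_int 7)]),
   and aarch64_stepped_int_parallel_p accepts that RTX for STEP == 2.  */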
17209
17210 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17211 series with step STEP. */
17212
17213 bool
17214 aarch64_stepped_int_parallel_p (rtx op, int step)
17215 {
17216 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17217 return false;
17218
17219 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17220 for (int i = 1; i < XVECLEN (op, 0); ++i)
17221 if (!CONST_INT_P (XVECEXP (op, 0, i))
17222 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17223 return false;
17224
17225 return true;
17226 }
17227
17228 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17229 HIGH (exclusive). */
17230 void
17231 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17232 const_tree exp)
17233 {
17234 HOST_WIDE_INT lane;
17235 gcc_assert (CONST_INT_P (operand));
17236 lane = INTVAL (operand);
17237
17238 if (lane < low || lane >= high)
17239 {
17240 if (exp)
17241 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17242 else
17243 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17244 }
17245 }
17246
17247 /* Perform endian correction on lane number N, which indexes a vector
17248 of mode MODE, and return the result as an SImode rtx. */
17249
17250 rtx
17251 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17252 {
17253 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17254 }
17255
17256 /* Return TRUE if OP is a valid vector addressing mode. */
17257
17258 bool
17259 aarch64_simd_mem_operand_p (rtx op)
17260 {
17261 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17262 || REG_P (XEXP (op, 0)));
17263 }
17264
17265 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17266
17267 bool
17268 aarch64_sve_ld1r_operand_p (rtx op)
17269 {
17270 struct aarch64_address_info addr;
17271 scalar_mode mode;
17272
17273 return (MEM_P (op)
17274 && is_a <scalar_mode> (GET_MODE (op), &mode)
17275 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17276 && addr.type == ADDRESS_REG_IMM
17277 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17278 }
17279
17280 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17281 where the size of the read data is specified by `mode` and the size of the
17282 vector elements is specified by `elem_mode`. */
17283 bool
17284 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
17285 scalar_mode elem_mode)
17286 {
17287 struct aarch64_address_info addr;
17288 if (!MEM_P (op)
17289 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17290 return false;
17291
17292 if (addr.type == ADDRESS_REG_IMM)
17293 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
17294
17295 if (addr.type == ADDRESS_REG_REG)
17296 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17297
17298 return false;
17299 }
17300
17301 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17302 bool
17303 aarch64_sve_ld1rq_operand_p (rtx op)
17304 {
17305 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
17306 GET_MODE_INNER (GET_MODE (op)));
17307 }
17308
17309 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17310 accessing a vector where the element size is specified by `elem_mode`. */
17311 bool
17312 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
17313 {
17314 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
17315 }
17316
17317 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17318 bool
17319 aarch64_sve_ldff1_operand_p (rtx op)
17320 {
17321 if (!MEM_P (op))
17322 return false;
17323
17324 struct aarch64_address_info addr;
17325 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17326 return false;
17327
17328 if (addr.type == ADDRESS_REG_IMM)
17329 return known_eq (addr.const_offset, 0);
17330
17331 return addr.type == ADDRESS_REG_REG;
17332 }
17333
17334 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17335 bool
17336 aarch64_sve_ldnf1_operand_p (rtx op)
17337 {
17338 struct aarch64_address_info addr;
17339
17340 return (MEM_P (op)
17341 && aarch64_classify_address (&addr, XEXP (op, 0),
17342 GET_MODE (op), false)
17343 && addr.type == ADDRESS_REG_IMM);
17344 }
17345
17346 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17347 The conditions for STR are the same. */
17348 bool
17349 aarch64_sve_ldr_operand_p (rtx op)
17350 {
17351 struct aarch64_address_info addr;
17352
17353 return (MEM_P (op)
17354 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17355 false, ADDR_QUERY_ANY)
17356 && addr.type == ADDRESS_REG_IMM);
17357 }
17358
17359 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17360 addressing memory of mode MODE. */
17361 bool
17362 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17363 {
17364 struct aarch64_address_info addr;
17365 if (!aarch64_classify_address (&addr, op, mode, false))
17366 return false;
17367
17368 if (addr.type == ADDRESS_REG_IMM)
17369 return known_eq (addr.const_offset, 0);
17370
17371 return addr.type == ADDRESS_REG_REG;
17372 }
17373
17374 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17375 We need to be able to access the individual pieces, so the range
17376 is different from LD[234] and ST[234]. */
17377 bool
17378 aarch64_sve_struct_memory_operand_p (rtx op)
17379 {
17380 if (!MEM_P (op))
17381 return false;
17382
17383 machine_mode mode = GET_MODE (op);
17384 struct aarch64_address_info addr;
17385 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17386 ADDR_QUERY_ANY)
17387 || addr.type != ADDRESS_REG_IMM)
17388 return false;
17389
17390 poly_int64 first = addr.const_offset;
17391 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17392 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17393 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17394 }
17395
17396 /* Emit a register copy from operand to operand, taking care not to
17397 early-clobber source registers in the process.
17398
17399 COUNT is the number of components into which the copy needs to be
17400 decomposed. */
17401 void
17402 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17403 unsigned int count)
17404 {
17405 unsigned int i;
17406 int rdest = REGNO (operands[0]);
17407 int rsrc = REGNO (operands[1]);
17408
17409 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17410 || rdest < rsrc)
17411 for (i = 0; i < count; i++)
17412 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17413 gen_rtx_REG (mode, rsrc + i));
17414 else
17415 for (i = 0; i < count; i++)
17416 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17417 gen_rtx_REG (mode, rsrc + count - i - 1));
17418 }
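/* Illustrative example (editorial addition, not part of the original
   source): when copying a two-register value from (v0,v1) to (v1,v2), a
   forward copy would clobber v1 before it is read, so the loop above copies
   backwards, emitting something like

     mov v2.16b, v1.16b
     mov v1.16b, v0.16b

   Non-overlapping copies, or overlapping ones with RDEST < RSRC, go
   forwards instead.  */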
17419
17420 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17421 one of VSTRUCT modes: OI, CI, or XI. */
17422 int
17423 aarch64_simd_attr_length_rglist (machine_mode mode)
17424 {
17425 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17426 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17427 }
17428
17429 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17430 alignment of a vector to 128 bits. SVE predicates have an alignment of
17431 16 bits. */
17432 static HOST_WIDE_INT
17433 aarch64_simd_vector_alignment (const_tree type)
17434 {
17435 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17436 be set for non-predicate vectors of booleans. Modes are the most
17437 direct way we have of identifying real SVE predicate types. */
17438 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17439 return 16;
17440 widest_int min_size
17441 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17442 return wi::umin (min_size, 128).to_uhwi ();
17443 }
17444
17445 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17446 static poly_uint64
17447 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17448 {
17449 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17450 {
17451 /* If the length of the vector is fixed, try to align to that length,
17452 otherwise don't try to align at all. */
17453 HOST_WIDE_INT result;
17454 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17455 result = TYPE_ALIGN (TREE_TYPE (type));
17456 return result;
17457 }
17458 return TYPE_ALIGN (type);
17459 }
17460
17461 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17462 static bool
17463 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17464 {
17465 if (is_packed)
17466 return false;
17467
17468 /* For fixed-length vectors, check that the vectorizer will aim for
17469 full-vector alignment. This isn't true for generic GCC vectors
17470 that are wider than the ABI maximum of 128 bits. */
17471 poly_uint64 preferred_alignment =
17472 aarch64_vectorize_preferred_vector_alignment (type);
17473 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17474 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17475 preferred_alignment))
17476 return false;
17477
17478 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17479 return true;
17480 }
17481
17482 /* Return true if the vector misalignment factor is supported by the
17483 target. */
17484 static bool
17485 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17486 const_tree type, int misalignment,
17487 bool is_packed)
17488 {
17489 if (TARGET_SIMD && STRICT_ALIGNMENT)
17490 {
17491 /* Return if movmisalign pattern is not supported for this mode. */
17492 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17493 return false;
17494
17495 /* Misalignment factor is unknown at compile time. */
17496 if (misalignment == -1)
17497 return false;
17498 }
17499 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17500 is_packed);
17501 }
17502
17503 /* If VALS is a vector constant that can be loaded into a register
17504 using DUP, generate instructions to do so and return an RTX to
17505 assign to the register. Otherwise return NULL_RTX. */
17506 static rtx
17507 aarch64_simd_dup_constant (rtx vals)
17508 {
17509 machine_mode mode = GET_MODE (vals);
17510 machine_mode inner_mode = GET_MODE_INNER (mode);
17511 rtx x;
17512
17513 if (!const_vec_duplicate_p (vals, &x))
17514 return NULL_RTX;
17515
17516 /* We can load this constant by using DUP and a constant in a
17517 single ARM register. This will be cheaper than a vector
17518 load. */
17519 x = copy_to_mode_reg (inner_mode, x);
17520 return gen_vec_duplicate (mode, x);
17521 }
17522
17523
17524 /* Generate code to load VALS, which is a PARALLEL containing only
17525 constants (for vec_init) or CONST_VECTOR, efficiently into a
17526 register. Returns an RTX to copy into the register, or NULL_RTX
17527 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17528 static rtx
17529 aarch64_simd_make_constant (rtx vals)
17530 {
17531 machine_mode mode = GET_MODE (vals);
17532 rtx const_dup;
17533 rtx const_vec = NULL_RTX;
17534 int n_const = 0;
17535 int i;
17536
17537 if (GET_CODE (vals) == CONST_VECTOR)
17538 const_vec = vals;
17539 else if (GET_CODE (vals) == PARALLEL)
17540 {
17541 /* A CONST_VECTOR must contain only CONST_INTs and
17542 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17543 Only store valid constants in a CONST_VECTOR. */
17544 int n_elts = XVECLEN (vals, 0);
17545 for (i = 0; i < n_elts; ++i)
17546 {
17547 rtx x = XVECEXP (vals, 0, i);
17548 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17549 n_const++;
17550 }
17551 if (n_const == n_elts)
17552 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17553 }
17554 else
17555 gcc_unreachable ();
17556
17557 if (const_vec != NULL_RTX
17558 && aarch64_simd_valid_immediate (const_vec, NULL))
17559 /* Load using MOVI/MVNI. */
17560 return const_vec;
17561 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17562 /* Loaded using DUP. */
17563 return const_dup;
17564 else if (const_vec != NULL_RTX)
17565 /* Load from constant pool. We cannot take advantage of single-cycle
17566 LD1 because we need a PC-relative addressing mode. */
17567 return const_vec;
17568 else
17569 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17570 We cannot construct an initializer. */
17571 return NULL_RTX;
17572 }
17573
17574 /* Expand a vector initialisation sequence, such that TARGET is
17575 initialised to contain VALS. */
17576
17577 void
17578 aarch64_expand_vector_init (rtx target, rtx vals)
17579 {
17580 machine_mode mode = GET_MODE (target);
17581 scalar_mode inner_mode = GET_MODE_INNER (mode);
17582 /* The number of vector elements. */
17583 int n_elts = XVECLEN (vals, 0);
17584 /* The number of vector elements which are not constant. */
17585 int n_var = 0;
17586 rtx any_const = NULL_RTX;
17587 /* The first element of vals. */
17588 rtx v0 = XVECEXP (vals, 0, 0);
17589 bool all_same = true;
17590
17591 /* This is a special vec_init<M><N> where N is not an element mode but a
17592 vector mode with half the elements of M. We expect to find two entries
17593 of mode N in VALS and we must put their concatenation into TARGET. */
17594 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17595 {
17596 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17597 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17598 rtx lo = XVECEXP (vals, 0, 0);
17599 rtx hi = XVECEXP (vals, 0, 1);
17600 machine_mode narrow_mode = GET_MODE (lo);
17601 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17602 gcc_assert (narrow_mode == GET_MODE (hi));
17603
17604 /* When we want to concatenate a half-width vector with zeroes we can
17605 use the aarch64_combinez[_be] patterns. Just make sure that the
17606 zeroes are in the right half. */
17607 if (BYTES_BIG_ENDIAN
17608 && aarch64_simd_imm_zero (lo, narrow_mode)
17609 && general_operand (hi, narrow_mode))
17610 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17611 else if (!BYTES_BIG_ENDIAN
17612 && aarch64_simd_imm_zero (hi, narrow_mode)
17613 && general_operand (lo, narrow_mode))
17614 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17615 else
17616 {
17617 /* Else create the two half-width registers and combine them. */
17618 if (!REG_P (lo))
17619 lo = force_reg (GET_MODE (lo), lo);
17620 if (!REG_P (hi))
17621 hi = force_reg (GET_MODE (hi), hi);
17622
17623 if (BYTES_BIG_ENDIAN)
17624 std::swap (lo, hi);
17625 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17626 }
17627 return;
17628 }
17629
17630 /* Count the number of variable elements to initialise. */
17631 for (int i = 0; i < n_elts; ++i)
17632 {
17633 rtx x = XVECEXP (vals, 0, i);
17634 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17635 ++n_var;
17636 else
17637 any_const = x;
17638
17639 all_same &= rtx_equal_p (x, v0);
17640 }
17641
17642 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17643 how best to handle this. */
17644 if (n_var == 0)
17645 {
17646 rtx constant = aarch64_simd_make_constant (vals);
17647 if (constant != NULL_RTX)
17648 {
17649 emit_move_insn (target, constant);
17650 return;
17651 }
17652 }
17653
17654 /* Splat a single non-constant element if we can. */
17655 if (all_same)
17656 {
17657 rtx x = copy_to_mode_reg (inner_mode, v0);
17658 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17659 return;
17660 }
17661
17662 enum insn_code icode = optab_handler (vec_set_optab, mode);
17663 gcc_assert (icode != CODE_FOR_nothing);
17664
17665 /* If there are only variable elements, try to optimize
17666 the insertion using dup for the most common element
17667 followed by insertions. */
17668
17669 /* The algorithm will fill matches[*][0] with the earliest matching element,
17670 and matches[X][1] with the count of duplicate elements (if X is the
17671 earliest element which has duplicates). */
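/* For example, for VALS == {x, y, x, x} (a purely illustrative case) the
   loops below produce matches[0] == {0, 3}, matches[1] == {1, 1},
   matches[2] == {0, 0} and matches[3] == {0, 0}, so x is the most common
   element: it is broadcast first and y is then inserted into lane 1.  */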
17672
17673 if (n_var == n_elts && n_elts <= 16)
17674 {
17675 int matches[16][2] = {0};
17676 for (int i = 0; i < n_elts; i++)
17677 {
17678 for (int j = 0; j <= i; j++)
17679 {
17680 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17681 {
17682 matches[i][0] = j;
17683 matches[j][1]++;
17684 break;
17685 }
17686 }
17687 }
17688 int maxelement = 0;
17689 int maxv = 0;
17690 for (int i = 0; i < n_elts; i++)
17691 if (matches[i][1] > maxv)
17692 {
17693 maxelement = i;
17694 maxv = matches[i][1];
17695 }
17696
17697 /* Create a duplicate of the most common element, unless all elements
17698 are equally useless to us, in which case just immediately set the
17699 vector register using the first element. */
17700
17701 if (maxv == 1)
17702 {
17703 /* For vectors of two 64-bit elements, we can do even better. */
17704 if (n_elts == 2
17705 && (inner_mode == E_DImode
17706 || inner_mode == E_DFmode))
17707
17708 {
17709 rtx x0 = XVECEXP (vals, 0, 0);
17710 rtx x1 = XVECEXP (vals, 0, 1);
17711 /* Combine can pick up this case, but handling it directly
17712 here leaves clearer RTL.
17713
17714 This is load_pair_lanes<mode>, and also gives us a clean-up
17715 for store_pair_lanes<mode>. */
17716 if (memory_operand (x0, inner_mode)
17717 && memory_operand (x1, inner_mode)
17718 && !STRICT_ALIGNMENT
17719 && rtx_equal_p (XEXP (x1, 0),
17720 plus_constant (Pmode,
17721 XEXP (x0, 0),
17722 GET_MODE_SIZE (inner_mode))))
17723 {
17724 rtx t;
17725 if (inner_mode == DFmode)
17726 t = gen_load_pair_lanesdf (target, x0, x1);
17727 else
17728 t = gen_load_pair_lanesdi (target, x0, x1);
17729 emit_insn (t);
17730 return;
17731 }
17732 }
17733 /* The subreg-move sequence below will move into lane zero of the
17734 vector register. For big-endian we want that position to hold
17735 the last element of VALS. */
17736 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17737 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17738 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17739 }
17740 else
17741 {
17742 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17743 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17744 }
17745
17746 /* Insert the rest. */
17747 for (int i = 0; i < n_elts; i++)
17748 {
17749 rtx x = XVECEXP (vals, 0, i);
17750 if (matches[i][0] == maxelement)
17751 continue;
17752 x = copy_to_mode_reg (inner_mode, x);
17753 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17754 }
17755 return;
17756 }
17757
17758 /* Initialise a vector which is part-variable. We want to first try
17759 to build those lanes which are constant in the most efficient way we
17760 can. */
17761 if (n_var != n_elts)
17762 {
17763 rtx copy = copy_rtx (vals);
17764
17765 /* Load constant part of vector. We really don't care what goes into the
17766 parts we will overwrite, but we're more likely to be able to load the
17767 constant efficiently if it has fewer, larger, repeating parts
17768 (see aarch64_simd_valid_immediate). */
17769 for (int i = 0; i < n_elts; i++)
17770 {
17771 rtx x = XVECEXP (vals, 0, i);
17772 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17773 continue;
17774 rtx subst = any_const;
17775 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17776 {
17777 /* Look in the copied vector, as more elements are const. */
17778 rtx test = XVECEXP (copy, 0, i ^ bit);
17779 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17780 {
17781 subst = test;
17782 break;
17783 }
17784 }
17785 XVECEXP (copy, 0, i) = subst;
17786 }
17787 aarch64_expand_vector_init (target, copy);
17788 }
17789
17790 /* Insert the variable lanes directly. */
17791 for (int i = 0; i < n_elts; i++)
17792 {
17793 rtx x = XVECEXP (vals, 0, i);
17794 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17795 continue;
17796 x = copy_to_mode_reg (inner_mode, x);
17797 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17798 }
17799 }
17800
17801 /* Emit RTL corresponding to:
17802 insr TARGET, ELEM. */
17803
17804 static void
17805 emit_insr (rtx target, rtx elem)
17806 {
17807 machine_mode mode = GET_MODE (target);
17808 scalar_mode elem_mode = GET_MODE_INNER (mode);
17809 elem = force_reg (elem_mode, elem);
17810
17811 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17812 gcc_assert (icode != CODE_FOR_nothing);
17813 emit_insn (GEN_FCN (icode) (target, target, elem));
17814 }
17815
17816 /* Subroutine of aarch64_sve_expand_vector_init for handling
17817 trailing constants.
17818 This function works as follows:
17819 (a) Create a new vector consisting of trailing constants.
17820 (b) Initialize TARGET with the constant vector using emit_move_insn.
17821 (c) Insert remaining elements in TARGET using insr.
17822 NELTS is the total number of elements in the original vector, while
17823 NELTS_REQD is the number of elements that are actually
17824 significant.
17825
17826 ??? The heuristic used is to do the above only if the number of constants
17827 is at least half the total number of elements. May need fine tuning. */
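
/* A worked example (x and y denote arbitrary non-constant elements):
   for BUILDER == {x, y, 1, 2} with NELTS == NELTS_REQD == 4 there are two
   trailing constants, which is >= 4 / 2, so TARGET is first loaded with a
   constant vector whose leading elements are {1, 2, ...} and the remaining
   elements are then shifted in:
     insr	target, y
     insr	target, x
   giving {x, y, 1, 2}.  */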
17828
17829 static bool
17830 aarch64_sve_expand_vector_init_handle_trailing_constants
17831 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17832 {
17833 machine_mode mode = GET_MODE (target);
17834 scalar_mode elem_mode = GET_MODE_INNER (mode);
17835 int n_trailing_constants = 0;
17836
17837 for (int i = nelts_reqd - 1;
17838 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17839 i--)
17840 n_trailing_constants++;
17841
17842 if (n_trailing_constants >= nelts_reqd / 2)
17843 {
17844 rtx_vector_builder v (mode, 1, nelts);
17845 for (int i = 0; i < nelts; i++)
17846 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17847 rtx const_vec = v.build ();
17848 emit_move_insn (target, const_vec);
17849
17850 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17851 emit_insr (target, builder.elt (i));
17852
17853 return true;
17854 }
17855
17856 return false;
17857 }
17858
17859 /* Subroutine of aarch64_sve_expand_vector_init.
17860 Works as follows:
17861 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17862 (b) Skip trailing elements from BUILDER, which are the same as
17863 element NELTS_REQD - 1.
17864 (c) Insert earlier elements in reverse order in TARGET using insr. */
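
/* For instance (a and b denote arbitrary non-constant elements): for
   BUILDER == {a, b, b, b} and NELTS_REQD == 4, element 3 (b) is broadcast
   with a dup, the three trailing duplicates are skipped, and a single
   "insr target, a" then produces {a, b, b, b}.  */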
17865
17866 static void
17867 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17868 const rtx_vector_builder &builder,
17869 int nelts_reqd)
17870 {
17871 machine_mode mode = GET_MODE (target);
17872 scalar_mode elem_mode = GET_MODE_INNER (mode);
17873
17874 struct expand_operand ops[2];
17875 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17876 gcc_assert (icode != CODE_FOR_nothing);
17877
17878 create_output_operand (&ops[0], target, mode);
17879 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17880 expand_insn (icode, 2, ops);
17881
17882 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17883 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17884 emit_insr (target, builder.elt (i));
17885 }
17886
17887 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
17888 when all trailing elements of BUILDER are the same.
17889 This works as follows:
17890 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17891 (b) Insert remaining elements in TARGET using insr.
17892
17893 ??? The heuristic used is to do the above if the number of identical
17894 trailing elements is at least 3/4 of the total number of elements,
17895 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
17896
17897 static bool
17898 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17899 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17900 {
17901 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17902 if (ndups >= (3 * nelts_reqd) / 4)
17903 {
17904 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17905 nelts_reqd - ndups + 1);
17906 return true;
17907 }
17908
17909 return false;
17910 }
17911
17912 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17913 of elements in BUILDER.
17914
17915 The function tries to initialize TARGET from BUILDER if it fits one
17916 of the special cases outlined below.
17917
17918 Failing that, the function divides BUILDER into two sub-vectors:
17919 v_even = even elements of BUILDER;
17920 v_odd = odd elements of BUILDER;
17921
17922 and recursively calls itself with v_even and v_odd.
17923
17924 if (recursive call succeeded for v_even or v_odd)
17925 TARGET = zip (v_even, v_odd)
17926
17927 The function returns true if it managed to build TARGET from BUILDER
17928 with one of the special cases, false otherwise.
17929
17930 Example: {a, 1, b, 2, c, 3, d, 4}
17931
17932 The vector gets divided into:
17933 v_even = {a, b, c, d}
17934 v_odd = {1, 2, 3, 4}
17935
17936 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17937 initializes tmp2 from the constant vector v_odd using emit_move_insn.
17938
17939 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17940 4 non-constant elements that match none of the special cases, so we
construct tmp1 from v_even using insr:
17941 tmp1 = dup(d)
17942 insr tmp1, c
17943 insr tmp1, b
17944 insr tmp1, a
17945
17946 And finally:
17947 TARGET = zip (tmp1, tmp2)
17948 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17949
17950 static bool
17951 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17952 int nelts, int nelts_reqd)
17953 {
17954 machine_mode mode = GET_MODE (target);
17955
17956 /* Case 1: Vector contains trailing constants. */
17957
17958 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17959 (target, builder, nelts, nelts_reqd))
17960 return true;
17961
17962 /* Case 2: Vector contains leading constants. */
17963
17964 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17965 for (int i = 0; i < nelts_reqd; i++)
17966 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17967 rev_builder.finalize ();
17968
17969 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17970 (target, rev_builder, nelts, nelts_reqd))
17971 {
17972 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17973 return true;
17974 }
17975
17976 /* Case 3: Vector contains trailing same element. */
17977
17978 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17979 (target, builder, nelts_reqd))
17980 return true;
17981
17982 /* Case 4: Vector contains leading same element. */
17983
17984 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17985 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17986 {
17987 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17988 return true;
17989 }
17990
17991 /* Avoid recursing below 4 elements.
17992 ??? The threshold 4 may need fine-tuning. */
17993
17994 if (nelts_reqd <= 4)
17995 return false;
17996
17997 rtx_vector_builder v_even (mode, 1, nelts);
17998 rtx_vector_builder v_odd (mode, 1, nelts);
17999
18000 for (int i = 0; i < nelts * 2; i += 2)
18001 {
18002 v_even.quick_push (builder.elt (i));
18003 v_odd.quick_push (builder.elt (i + 1));
18004 }
18005
18006 v_even.finalize ();
18007 v_odd.finalize ();
18008
18009 rtx tmp1 = gen_reg_rtx (mode);
18010 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18011 nelts, nelts_reqd / 2);
18012
18013 rtx tmp2 = gen_reg_rtx (mode);
18014 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18015 nelts, nelts_reqd / 2);
18016
18017 if (!did_even_p && !did_odd_p)
18018 return false;
18019
18020 /* Initialize whichever of v_even and v_odd did not match one of the
18021 special cases using INSR, then zip the two halves together. */
18022
18023 if (!did_even_p)
18024 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18025
18026 if (!did_odd_p)
18027 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18028
18029 rtvec v = gen_rtvec (2, tmp1, tmp2);
18030 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18031 return true;
18032 }
18033
18034 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18035
18036 void
18037 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18038 {
18039 machine_mode mode = GET_MODE (target);
18040 int nelts = XVECLEN (vals, 0);
18041
18042 rtx_vector_builder v (mode, 1, nelts);
18043 for (int i = 0; i < nelts; i++)
18044 v.quick_push (XVECEXP (vals, 0, i));
18045 v.finalize ();
18046
18047 /* If neither sub-vector of v could be initialized specially,
18048 then use INSR to insert all elements from v into TARGET.
18049 ??? This might not be optimal for vectors with large
18050 initializers like 16 elements or more.
18051 For nelts < 4, it probably isn't useful to handle specially. */
18052
18053 if (nelts < 4
18054 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18055 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18056 }
18057
18058 /* Check whether VALUE is a vector constant in which every element
18059 is either a power of 2 or a negated power of 2. If so, return
18060 a constant vector of log2s, and flip CODE between PLUS and MINUS
18061 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
18062
18063 static rtx
18064 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18065 {
18066 if (GET_CODE (value) != CONST_VECTOR)
18067 return NULL_RTX;
18068
18069 rtx_vector_builder builder;
18070 if (!builder.new_unary_operation (GET_MODE (value), value, false))
18071 return NULL_RTX;
18072
18073 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18074 /* 1 if the result of the multiplication must be negated,
18075 0 if it mustn't, or -1 if we don't yet care. */
18076 int negate = -1;
18077 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18078 for (unsigned int i = 0; i < encoded_nelts; ++i)
18079 {
18080 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18081 if (!CONST_SCALAR_INT_P (elt))
18082 return NULL_RTX;
18083 rtx_mode_t val (elt, int_mode);
18084 wide_int pow2 = wi::neg (val);
18085 if (val != pow2)
18086 {
18087 /* It matters whether we negate or not. Make that choice,
18088 and make sure that it's consistent with previous elements. */
18089 if (negate == !wi::neg_p (val))
18090 return NULL_RTX;
18091 negate = wi::neg_p (val);
18092 if (!negate)
18093 pow2 = val;
18094 }
18095 /* POW2 is now the value that we want to be a power of 2. */
18096 int shift = wi::exact_log2 (pow2);
18097 if (shift < 0)
18098 return NULL_RTX;
18099 builder.quick_push (gen_int_mode (shift, int_mode));
18100 }
18101 if (negate == -1)
18102 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
18103 code = PLUS;
18104 else if (negate == 1)
18105 code = code == PLUS ? MINUS : PLUS;
18106 return builder.build ();
18107 }
18108
18109 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18110 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
18111 operands array, in the same order as for fma_optab. Return true if
18112 the function emitted all the necessary instructions, false if the caller
18113 should generate the pattern normally with the new OPERANDS array. */
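
/* As an illustration: for an addition in which operands[2] is the constant
   splat {4, 4, ...}, the multiplication is rewritten as a left shift by 2
   and a plain vector add of operands[3] and the shifted value is emitted;
   for {-8, -8, ...} the shift amount is 3 and CODE is flipped so that the
   product is subtracted instead.  */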
18114
18115 bool
18116 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
18117 {
18118 machine_mode mode = GET_MODE (operands[0]);
18119 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
18120 {
18121 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
18122 NULL_RTX, true, OPTAB_DIRECT);
18123 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
18124 operands[3], product, operands[0], true,
18125 OPTAB_DIRECT);
18126 return true;
18127 }
18128 operands[2] = force_reg (mode, operands[2]);
18129 return false;
18130 }
18131
18132 /* Likewise, but for a conditional pattern. */
18133
18134 bool
18135 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18136 {
18137 machine_mode mode = GET_MODE (operands[0]);
18138 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18139 {
18140 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18141 NULL_RTX, true, OPTAB_DIRECT);
18142 emit_insn (gen_cond (code, mode, operands[0], operands[1],
18143 operands[4], product, operands[5]));
18144 return true;
18145 }
18146 operands[3] = force_reg (mode, operands[3]);
18147 return false;
18148 }
18149
18150 static unsigned HOST_WIDE_INT
18151 aarch64_shift_truncation_mask (machine_mode mode)
18152 {
18153 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18154 return 0;
18155 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18156 }
18157
18158 /* Select a format to encode pointers in exception handling data. */
18159 int
18160 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18161 {
18162 int type;
18163 switch (aarch64_cmodel)
18164 {
18165 case AARCH64_CMODEL_TINY:
18166 case AARCH64_CMODEL_TINY_PIC:
18167 case AARCH64_CMODEL_SMALL:
18168 case AARCH64_CMODEL_SMALL_PIC:
18169 case AARCH64_CMODEL_SMALL_SPIC:
18170 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18171 for everything. */
18172 type = DW_EH_PE_sdata4;
18173 break;
18174 default:
18175 /* No assumptions here. 8-byte relocs required. */
18176 type = DW_EH_PE_sdata8;
18177 break;
18178 }
18179 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18180 }
18181
18182 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18183
18184 static void
18185 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18186 {
18187 if (TREE_CODE (decl) == FUNCTION_DECL)
18188 {
18189 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18190 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18191 {
18192 fprintf (stream, "\t.variant_pcs\t");
18193 assemble_name (stream, name);
18194 fprintf (stream, "\n");
18195 }
18196 }
18197 }
18198
18199 /* The last .arch and .tune assembly strings that we printed. */
18200 static std::string aarch64_last_printed_arch_string;
18201 static std::string aarch64_last_printed_tune_string;
18202
18203 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18204 by the function fndecl. */
18205
18206 void
18207 aarch64_declare_function_name (FILE *stream, const char* name,
18208 tree fndecl)
18209 {
18210 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18211
18212 struct cl_target_option *targ_options;
18213 if (target_parts)
18214 targ_options = TREE_TARGET_OPTION (target_parts);
18215 else
18216 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18217 gcc_assert (targ_options);
18218
18219 const struct processor *this_arch
18220 = aarch64_get_arch (targ_options->x_explicit_arch);
18221
18222 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18223 std::string extension
18224 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18225 this_arch->flags);
18226 /* Only update the assembler .arch string if it is distinct from the last
18227 such string we printed. */
18228 std::string to_print = this_arch->name + extension;
18229 if (to_print != aarch64_last_printed_arch_string)
18230 {
18231 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18232 aarch64_last_printed_arch_string = to_print;
18233 }
18234
18235 /* Print the cpu name we're tuning for in the comments; it might be
18236 useful to readers of the generated asm. Do it only when it changes
18237 from function to function and verbose assembly is requested. */
18238 const struct processor *this_tune
18239 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18240
18241 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18242 {
18243 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18244 this_tune->name);
18245 aarch64_last_printed_tune_string = this_tune->name;
18246 }
18247
18248 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18249
18250 /* Don't forget the type directive for ELF. */
18251 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18252 ASM_OUTPUT_LABEL (stream, name);
18253
18254 cfun->machine->label_is_assembled = true;
18255 }
18256
18257 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
18258 the function label and emit a BTI if necessary. */
18259
18260 void
18261 aarch64_print_patchable_function_entry (FILE *file,
18262 unsigned HOST_WIDE_INT patch_area_size,
18263 bool record_p)
18264 {
18265 if (cfun->machine->label_is_assembled
18266 && aarch64_bti_enabled ()
18267 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
18268 {
18269 /* Remove the BTI that follows the patch area and insert a new BTI
18270 before the patch area right after the function label. */
18271 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
18272 if (insn
18273 && INSN_P (insn)
18274 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
18275 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
18276 delete_insn (insn);
18277 asm_fprintf (file, "\thint\t34 // bti c\n");
18278 }
18279
18280 default_print_patchable_function_entry (file, patch_area_size, record_p);
18281 }
18282
18283 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18284
18285 void
18286 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18287 {
18288 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18289 const char *value = IDENTIFIER_POINTER (target);
18290 aarch64_asm_output_variant_pcs (stream, decl, name);
18291 ASM_OUTPUT_DEF (stream, name, value);
18292 }
18293
18294 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18295 function symbol references. */
18296
18297 void
18298 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18299 {
18300 default_elf_asm_output_external (stream, decl, name);
18301 aarch64_asm_output_variant_pcs (stream, decl, name);
18302 }
18303
18304 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18305 Used to output the .cfi_b_key_frame directive when signing the current
18306 function with the B key. */
18307
18308 void
18309 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18310 {
18311 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18312 && aarch64_ra_sign_key == AARCH64_KEY_B)
18313 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18314 }
18315
18316 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18317
18318 static void
18319 aarch64_start_file (void)
18320 {
18321 struct cl_target_option *default_options
18322 = TREE_TARGET_OPTION (target_option_default_node);
18323
18324 const struct processor *default_arch
18325 = aarch64_get_arch (default_options->x_explicit_arch);
18326 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18327 std::string extension
18328 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18329 default_arch->flags);
18330
18331 aarch64_last_printed_arch_string = default_arch->name + extension;
18332 aarch64_last_printed_tune_string = "";
18333 asm_fprintf (asm_out_file, "\t.arch %s\n",
18334 aarch64_last_printed_arch_string.c_str ());
18335
18336 default_file_start ();
18337 }
18338
18339 /* Emit load exclusive. */
18340
18341 static void
18342 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18343 rtx mem, rtx model_rtx)
18344 {
18345 if (mode == TImode)
18346 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18347 gen_highpart (DImode, rval),
18348 mem, model_rtx));
18349 else
18350 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18351 }
18352
18353 /* Emit store exclusive. */
18354
18355 static void
18356 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18357 rtx mem, rtx rval, rtx model_rtx)
18358 {
18359 if (mode == TImode)
18360 emit_insn (gen_aarch64_store_exclusive_pair
18361 (bval, mem, operand_subword (rval, 0, 0, TImode),
18362 operand_subword (rval, 1, 0, TImode), model_rtx));
18363 else
18364 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18365 }
18366
18367 /* Mark the previous jump instruction as unlikely. */
18368
18369 static void
18370 aarch64_emit_unlikely_jump (rtx insn)
18371 {
18372 rtx_insn *jump = emit_jump_insn (insn);
18373 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18374 }
18375
18376 /* We store the names of the various atomic helpers in a 5x4 array.
18377 Return the libcall function given MODE, MODEL and NAMES. */
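
/* For example, a CAS on an SImode location with MEMMODEL_ACQUIRE maps to
   mode_idx 2 and model_idx 1, i.e. the "__aarch64_cas4_acq" helper from
   aarch64_ool_cas_names below.  */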
18378
18379 rtx
18380 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18381 const atomic_ool_names *names)
18382 {
18383 memmodel model = memmodel_base (INTVAL (model_rtx));
18384 int mode_idx, model_idx;
18385
18386 switch (mode)
18387 {
18388 case E_QImode:
18389 mode_idx = 0;
18390 break;
18391 case E_HImode:
18392 mode_idx = 1;
18393 break;
18394 case E_SImode:
18395 mode_idx = 2;
18396 break;
18397 case E_DImode:
18398 mode_idx = 3;
18399 break;
18400 case E_TImode:
18401 mode_idx = 4;
18402 break;
18403 default:
18404 gcc_unreachable ();
18405 }
18406
18407 switch (model)
18408 {
18409 case MEMMODEL_RELAXED:
18410 model_idx = 0;
18411 break;
18412 case MEMMODEL_CONSUME:
18413 case MEMMODEL_ACQUIRE:
18414 model_idx = 1;
18415 break;
18416 case MEMMODEL_RELEASE:
18417 model_idx = 2;
18418 break;
18419 case MEMMODEL_ACQ_REL:
18420 case MEMMODEL_SEQ_CST:
18421 model_idx = 3;
18422 break;
18423 default:
18424 gcc_unreachable ();
18425 }
18426
18427 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18428 VISIBILITY_HIDDEN);
18429 }
18430
18431 #define DEF0(B, N) \
18432 { "__aarch64_" #B #N "_relax", \
18433 "__aarch64_" #B #N "_acq", \
18434 "__aarch64_" #B #N "_rel", \
18435 "__aarch64_" #B #N "_acq_rel" }
18436
18437 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18438 { NULL, NULL, NULL, NULL }
18439 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18440
18441 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18442 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18443 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18444 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18445 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18446 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18447
18448 #undef DEF0
18449 #undef DEF4
18450 #undef DEF5
18451
18452 /* Expand a compare and swap pattern. */
18453
18454 void
18455 aarch64_expand_compare_and_swap (rtx operands[])
18456 {
18457 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18458 machine_mode mode, r_mode;
18459
18460 bval = operands[0];
18461 rval = operands[1];
18462 mem = operands[2];
18463 oldval = operands[3];
18464 newval = operands[4];
18465 is_weak = operands[5];
18466 mod_s = operands[6];
18467 mod_f = operands[7];
18468 mode = GET_MODE (mem);
18469
18470 /* Normally the succ memory model must be stronger than fail, but in the
18471 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18472 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18473 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18474 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18475 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18476
18477 r_mode = mode;
18478 if (mode == QImode || mode == HImode)
18479 {
18480 r_mode = SImode;
18481 rval = gen_reg_rtx (r_mode);
18482 }
18483
18484 if (TARGET_LSE)
18485 {
18486 /* The CAS insn requires oldval and rval to overlap, but we need to
18487 have a copy of oldval saved across the operation to tell whether
18488 the operation was successful. */
18489 if (reg_overlap_mentioned_p (rval, oldval))
18490 rval = copy_to_mode_reg (r_mode, oldval);
18491 else
18492 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18493
18494 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18495 newval, mod_s));
18496 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18497 }
18498 else if (TARGET_OUTLINE_ATOMICS)
18499 {
18500 /* Oldval must satisfy compare afterward. */
18501 if (!aarch64_plus_operand (oldval, mode))
18502 oldval = force_reg (mode, oldval);
18503 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18504 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18505 oldval, mode, newval, mode,
18506 XEXP (mem, 0), Pmode);
18507 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18508 }
18509 else
18510 {
18511 /* The oldval predicate varies by mode. Test it and force to reg. */
18512 insn_code code = code_for_aarch64_compare_and_swap (mode);
18513 if (!insn_data[code].operand[2].predicate (oldval, mode))
18514 oldval = force_reg (mode, oldval);
18515
18516 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18517 is_weak, mod_s, mod_f));
18518 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18519 }
18520
18521 if (r_mode != mode)
18522 rval = gen_lowpart (mode, rval);
18523 emit_move_insn (operands[1], rval);
18524
18525 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18526 emit_insn (gen_rtx_SET (bval, x));
18527 }
18528
18529 /* Emit a barrier appropriate for memory model MODEL at the end of a
18530 sequence implementing an atomic operation. */
18531
18532 static void
18533 aarch64_emit_post_barrier (enum memmodel model)
18534 {
18535 const enum memmodel base_model = memmodel_base (model);
18536
18537 if (is_mm_sync (model)
18538 && (base_model == MEMMODEL_ACQUIRE
18539 || base_model == MEMMODEL_ACQ_REL
18540 || base_model == MEMMODEL_SEQ_CST))
18541 {
18542 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18543 }
18544 }
18545
18546 /* Split a compare and swap pattern. */
18547
18548 void
18549 aarch64_split_compare_and_swap (rtx operands[])
18550 {
18551 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18552 gcc_assert (epilogue_completed);
18553
18554 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18555 machine_mode mode;
18556 bool is_weak;
18557 rtx_code_label *label1, *label2;
18558 enum memmodel model;
18559
18560 rval = operands[0];
18561 mem = operands[1];
18562 oldval = operands[2];
18563 newval = operands[3];
18564 is_weak = (operands[4] != const0_rtx);
18565 model_rtx = operands[5];
18566 scratch = operands[7];
18567 mode = GET_MODE (mem);
18568 model = memmodel_from_int (INTVAL (model_rtx));
18569
18570 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18571 loop:
18572 .label1:
18573 LD[A]XR rval, [mem]
18574 CBNZ rval, .label2
18575 ST[L]XR scratch, newval, [mem]
18576 CBNZ scratch, .label1
18577 .label2:
18578 CMP rval, 0. */
18579 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
18580 oldval == const0_rtx && mode != TImode);
18581
18582 label1 = NULL;
18583 if (!is_weak)
18584 {
18585 label1 = gen_label_rtx ();
18586 emit_label (label1);
18587 }
18588 label2 = gen_label_rtx ();
18589
18590 /* The initial load can be relaxed for a __sync operation since a final
18591 barrier will be emitted to stop code hoisting. */
18592 if (is_mm_sync (model))
18593 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18594 else
18595 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18596
18597 if (strong_zero_p)
18598 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18599 else
18600 {
18601 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18602 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18603 }
18604 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18605 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18606 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18607
18608 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18609
18610 if (!is_weak)
18611 {
18612 if (aarch64_track_speculation)
18613 {
18614 /* Emit an explicit compare instruction, so that we can correctly
18615 track the condition codes. */
18616 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18617 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18618 }
18619 else
18620 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18621
18622 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18623 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18624 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18625 }
18626 else
18627 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18628
18629 emit_label (label2);
18630
18631 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18632 to set the condition flags. If this is not used it will be removed by
18633 later passes. */
18634 if (strong_zero_p)
18635 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18636
18637 /* Emit any final barrier needed for a __sync operation. */
18638 if (is_mm_sync (model))
18639 aarch64_emit_post_barrier (model);
18640 }
18641
18642 /* Split an atomic operation. */
18643
18644 void
18645 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18646 rtx value, rtx model_rtx, rtx cond)
18647 {
18648 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18649 gcc_assert (epilogue_completed);
18650
18651 machine_mode mode = GET_MODE (mem);
18652 machine_mode wmode = (mode == DImode ? DImode : SImode);
18653 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18654 const bool is_sync = is_mm_sync (model);
18655 rtx_code_label *label;
18656 rtx x;
18657
18658 /* Split the atomic operation into a sequence. */
18659 label = gen_label_rtx ();
18660 emit_label (label);
18661
18662 if (new_out)
18663 new_out = gen_lowpart (wmode, new_out);
18664 if (old_out)
18665 old_out = gen_lowpart (wmode, old_out);
18666 else
18667 old_out = new_out;
18668 value = simplify_gen_subreg (wmode, value, mode, 0);
18669
18670 /* The initial load can be relaxed for a __sync operation since a final
18671 barrier will be emitted to stop code hoisting. */
18672 if (is_sync)
18673 aarch64_emit_load_exclusive (mode, old_out, mem,
18674 GEN_INT (MEMMODEL_RELAXED));
18675 else
18676 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18677
18678 switch (code)
18679 {
18680 case SET:
18681 new_out = value;
18682 break;
18683
18684 case NOT:
18685 x = gen_rtx_AND (wmode, old_out, value);
18686 emit_insn (gen_rtx_SET (new_out, x));
18687 x = gen_rtx_NOT (wmode, new_out);
18688 emit_insn (gen_rtx_SET (new_out, x));
18689 break;
18690
18691 case MINUS:
18692 if (CONST_INT_P (value))
18693 {
18694 value = GEN_INT (-INTVAL (value));
18695 code = PLUS;
18696 }
18697 /* Fall through. */
18698
18699 default:
18700 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18701 emit_insn (gen_rtx_SET (new_out, x));
18702 break;
18703 }
18704
18705 aarch64_emit_store_exclusive (mode, cond, mem,
18706 gen_lowpart (mode, new_out), model_rtx);
18707
18708 if (aarch64_track_speculation)
18709 {
18710 /* Emit an explicit compare instruction, so that we can correctly
18711 track the condition codes. */
18712 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18713 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18714 }
18715 else
18716 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18717
18718 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18719 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18720 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18721
18722 /* Emit any final barrier needed for a __sync operation. */
18723 if (is_sync)
18724 aarch64_emit_post_barrier (model);
18725 }
18726
18727 static void
18728 aarch64_init_libfuncs (void)
18729 {
18730 /* Half-precision float operations. The compiler handles all operations
18731 with NULL libfuncs by converting to SFmode. */
18732
18733 /* Conversions. */
18734 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18735 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18736
18737 /* Arithmetic. */
18738 set_optab_libfunc (add_optab, HFmode, NULL);
18739 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18740 set_optab_libfunc (smul_optab, HFmode, NULL);
18741 set_optab_libfunc (neg_optab, HFmode, NULL);
18742 set_optab_libfunc (sub_optab, HFmode, NULL);
18743
18744 /* Comparisons. */
18745 set_optab_libfunc (eq_optab, HFmode, NULL);
18746 set_optab_libfunc (ne_optab, HFmode, NULL);
18747 set_optab_libfunc (lt_optab, HFmode, NULL);
18748 set_optab_libfunc (le_optab, HFmode, NULL);
18749 set_optab_libfunc (ge_optab, HFmode, NULL);
18750 set_optab_libfunc (gt_optab, HFmode, NULL);
18751 set_optab_libfunc (unord_optab, HFmode, NULL);
18752 }
18753
18754 /* Target hook for c_mode_for_suffix. */
18755 static machine_mode
18756 aarch64_c_mode_for_suffix (char suffix)
18757 {
18758 if (suffix == 'q')
18759 return TFmode;
18760
18761 return VOIDmode;
18762 }
18763
18764 /* We can only represent floating point constants which will fit in
18765 "quarter-precision" values. These values are characterised by
18766 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
18767 by:
18768
18769 (-1)^s * (n/16) * 2^r
18770
18771 Where:
18772 's' is the sign bit.
18773 'n' is an integer in the range 16 <= n <= 31.
18774 'r' is an integer in the range -3 <= r <= 4. */
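
/* For example, 1.0 is (16/16) * 2^0 and 0.3125 is (20/16) * 2^-2, so both
   are representable; the largest representable magnitude is
   (31/16) * 2^4 == 31.0, and a value such as 0.2 (not a dyadic fraction)
   is not representable.  */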
18775
18776 /* Return true iff X can be represented as a quarter-precision
18777 floating point immediate operand. Note that we cannot represent 0.0. */
18778 bool
18779 aarch64_float_const_representable_p (rtx x)
18780 {
18781 /* This represents our current view of how many bits
18782 make up the mantissa. */
18783 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18784 int exponent;
18785 unsigned HOST_WIDE_INT mantissa, mask;
18786 REAL_VALUE_TYPE r, m;
18787 bool fail;
18788
18789 x = unwrap_const_vec_duplicate (x);
18790 if (!CONST_DOUBLE_P (x))
18791 return false;
18792
18793 if (GET_MODE (x) == VOIDmode
18794 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18795 return false;
18796
18797 r = *CONST_DOUBLE_REAL_VALUE (x);
18798
18799 /* We cannot represent infinities, NaNs or +/-zero. We won't
18800 know if we have +zero until we analyse the mantissa, but we
18801 can reject the other invalid values. */
18802 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18803 || REAL_VALUE_MINUS_ZERO (r))
18804 return false;
18805
18806 /* Extract exponent. */
18807 r = real_value_abs (&r);
18808 exponent = REAL_EXP (&r);
18809
18810 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18811 highest (sign) bit, with a fixed binary point at bit point_pos.
18812 m1 holds the low part of the mantissa, m2 the high part.
18813 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18814 bits for the mantissa, this can fail (low bits will be lost). */
18815 real_ldexp (&m, &r, point_pos - exponent);
18816 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18817
18818 /* If the low part of the mantissa has bits set we cannot represent
18819 the value. */
18820 if (w.ulow () != 0)
18821 return false;
18822 /* We have rejected the lower HOST_WIDE_INT, so update our
18823 understanding of how many bits lie in the mantissa and
18824 look only at the high HOST_WIDE_INT. */
18825 mantissa = w.elt (1);
18826 point_pos -= HOST_BITS_PER_WIDE_INT;
18827
18828 /* We can only represent values with a mantissa of the form 1.xxxx. */
18829 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18830 if ((mantissa & mask) != 0)
18831 return false;
18832
18833 /* Having filtered unrepresentable values, we may now remove all
18834 but the highest 5 bits. */
18835 mantissa >>= point_pos - 5;
18836
18837 /* We cannot represent the value 0.0, so reject it. This is handled
18838 elsewhere. */
18839 if (mantissa == 0)
18840 return false;
18841
18842 /* Then, as bit 4 is always set, we can mask it off, leaving
18843 the mantissa in the range [0, 15]. */
18844 mantissa &= ~(1 << 4);
18845 gcc_assert (mantissa <= 15);
18846
18847 /* GCC internally does not use an IEEE754-like encoding (where normalized
18848 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
18849 Our mantissa values are shifted 4 places to the left relative to
18850 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18851 by 5 places to correct for GCC's representation. */
18852 exponent = 5 - exponent;
18853
18854 return (exponent >= 0 && exponent <= 7);
18855 }
18856
18857 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18858 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18859 output MOVI/MVNI, ORR or BIC immediate. */
18860 char*
18861 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18862 enum simd_immediate_check which)
18863 {
18864 bool is_valid;
18865 static char templ[40];
18866 const char *mnemonic;
18867 const char *shift_op;
18868 unsigned int lane_count = 0;
18869 char element_char;
18870
18871 struct simd_immediate_info info;
18872
18873 /* This will return true to show that const_vector is legal for use as either
18874 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18875 It will also update INFO to show how the immediate should be generated.
18876 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18877 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18878 gcc_assert (is_valid);
18879
18880 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18881 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18882
18883 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18884 {
18885 gcc_assert (info.insn == simd_immediate_info::MOV
18886 && info.u.mov.shift == 0);
18887 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18888 move immediate path. */
18889 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18890 info.u.mov.value = GEN_INT (0);
18891 else
18892 {
18893 const unsigned int buf_size = 20;
18894 char float_buf[buf_size] = {'\0'};
18895 real_to_decimal_for_mode (float_buf,
18896 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18897 buf_size, buf_size, 1, info.elt_mode);
18898
18899 if (lane_count == 1)
18900 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18901 else
18902 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18903 lane_count, element_char, float_buf);
18904 return templ;
18905 }
18906 }
18907
18908 gcc_assert (CONST_INT_P (info.u.mov.value));
18909
18910 if (which == AARCH64_CHECK_MOV)
18911 {
18912 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18913 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18914 ? "msl" : "lsl");
18915 if (lane_count == 1)
18916 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18917 mnemonic, UINTVAL (info.u.mov.value));
18918 else if (info.u.mov.shift)
18919 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18920 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18921 element_char, UINTVAL (info.u.mov.value), shift_op,
18922 info.u.mov.shift);
18923 else
18924 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18925 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18926 element_char, UINTVAL (info.u.mov.value));
18927 }
18928 else
18929 {
18930 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18931 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18932 if (info.u.mov.shift)
18933 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18934 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18935 element_char, UINTVAL (info.u.mov.value), "lsl",
18936 info.u.mov.shift);
18937 else
18938 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18939 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18940 element_char, UINTVAL (info.u.mov.value));
18941 }
18942 return templ;
18943 }
18944
18945 char*
18946 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18947 {
18948
18949 /* If a floating point number was passed and we desire to use it in an
18950 integer mode, do the conversion to integer. */
18951 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18952 {
18953 unsigned HOST_WIDE_INT ival;
18954 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18955 gcc_unreachable ();
18956 immediate = gen_int_mode (ival, mode);
18957 }
18958
18959 machine_mode vmode;
18960 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
18961 a 128-bit vector mode. */
18962 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18963
18964 vmode = aarch64_simd_container_mode (mode, width);
18965 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18966 return aarch64_output_simd_mov_immediate (v_op, width);
18967 }
18968
18969 /* Return the output string to use for moving immediate CONST_VECTOR
18970 into an SVE register. */
18971
18972 char *
18973 aarch64_output_sve_mov_immediate (rtx const_vector)
18974 {
18975 static char templ[40];
18976 struct simd_immediate_info info;
18977 char element_char;
18978
18979 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18980 gcc_assert (is_valid);
18981
18982 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18983
18984 machine_mode vec_mode = GET_MODE (const_vector);
18985 if (aarch64_sve_pred_mode_p (vec_mode))
18986 {
18987 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18988 if (info.insn == simd_immediate_info::MOV)
18989 {
18990 gcc_assert (info.u.mov.value == const0_rtx);
18991 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18992 }
18993 else
18994 {
18995 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18996 unsigned int total_bytes;
18997 if (info.u.pattern == AARCH64_SV_ALL
18998 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18999 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19000 total_bytes / GET_MODE_SIZE (info.elt_mode));
19001 else
19002 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19003 svpattern_token (info.u.pattern));
19004 }
19005 return buf;
19006 }
19007
19008 if (info.insn == simd_immediate_info::INDEX)
19009 {
19010 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19011 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19012 element_char, INTVAL (info.u.index.base),
19013 INTVAL (info.u.index.step));
19014 return templ;
19015 }
19016
19017 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19018 {
19019 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19020 info.u.mov.value = GEN_INT (0);
19021 else
19022 {
19023 const int buf_size = 20;
19024 char float_buf[buf_size] = {};
19025 real_to_decimal_for_mode (float_buf,
19026 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19027 buf_size, buf_size, 1, info.elt_mode);
19028
19029 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19030 element_char, float_buf);
19031 return templ;
19032 }
19033 }
19034
19035 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19036 element_char, INTVAL (info.u.mov.value));
19037 return templ;
19038 }
19039
19040 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19041 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19042 pattern. */
19043
19044 char *
19045 aarch64_output_sve_ptrues (rtx const_unspec)
19046 {
19047 static char templ[40];
19048
19049 struct simd_immediate_info info;
19050 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19051 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19052
19053 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19054 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19055 svpattern_token (info.u.pattern));
19056 return templ;
19057 }
19058
19059 /* Split operands into moves from op[1] + op[2] into op[0]. */
19060
19061 void
19062 aarch64_split_combinev16qi (rtx operands[3])
19063 {
19064 unsigned int dest = REGNO (operands[0]);
19065 unsigned int src1 = REGNO (operands[1]);
19066 unsigned int src2 = REGNO (operands[2]);
19067 machine_mode halfmode = GET_MODE (operands[1]);
19068 unsigned int halfregs = REG_NREGS (operands[1]);
19069 rtx destlo, desthi;
19070
19071 gcc_assert (halfmode == V16QImode);
19072
19073 if (src1 == dest && src2 == dest + halfregs)
19074 {
19075 /* No-op move. Can't split to nothing; emit something. */
19076 emit_note (NOTE_INSN_DELETED);
19077 return;
19078 }
19079
19080 /* Preserve register attributes for variable tracking. */
19081 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19082 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19083 GET_MODE_SIZE (halfmode));
19084
19085 /* Special case of reversed high/low parts: swap the two source registers with a three-XOR sequence so that no scratch register is needed. */
19086 if (reg_overlap_mentioned_p (operands[2], destlo)
19087 && reg_overlap_mentioned_p (operands[1], desthi))
19088 {
19089 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19090 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
19091 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19092 }
19093 else if (!reg_overlap_mentioned_p (operands[2], destlo))
19094 {
19095 /* Try to avoid unnecessary moves if part of the result
19096 is in the right place already. */
19097 if (src1 != dest)
19098 emit_move_insn (destlo, operands[1]);
19099 if (src2 != dest + halfregs)
19100 emit_move_insn (desthi, operands[2]);
19101 }
19102 else
19103 {
19104 if (src2 != dest + halfregs)
19105 emit_move_insn (desthi, operands[2]);
19106 if (src1 != dest)
19107 emit_move_insn (destlo, operands[1]);
19108 }
19109 }
19110
19111 /* vec_perm support. */
19112
19113 struct expand_vec_perm_d
19114 {
19115 rtx target, op0, op1;
19116 vec_perm_indices perm;
19117 machine_mode vmode;
19118 unsigned int vec_flags;
19119 bool one_vector_p;
19120 bool testing_p;
19121 };
19122
19123 /* Generate a variable permutation. */
19124
19125 static void
19126 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
19127 {
19128 machine_mode vmode = GET_MODE (target);
19129 bool one_vector_p = rtx_equal_p (op0, op1);
19130
19131 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
19132 gcc_checking_assert (GET_MODE (op0) == vmode);
19133 gcc_checking_assert (GET_MODE (op1) == vmode);
19134 gcc_checking_assert (GET_MODE (sel) == vmode);
19135 gcc_checking_assert (TARGET_SIMD);
19136
19137 if (one_vector_p)
19138 {
19139 if (vmode == V8QImode)
19140 {
19141 /* Expand the argument to a V16QI mode by duplicating it. */
19142 rtx pair = gen_reg_rtx (V16QImode);
19143 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
19144 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19145 }
19146 else
19147 {
19148 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
19149 }
19150 }
19151 else
19152 {
19153 rtx pair;
19154
19155 if (vmode == V8QImode)
19156 {
19157 pair = gen_reg_rtx (V16QImode);
19158 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19159 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19160 }
19161 else
19162 {
19163 pair = gen_reg_rtx (OImode);
19164 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19165 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19166 }
19167 }
19168 }
19169
19170 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19171 NELT is the number of elements in the vector. */
19172
19173 void
19174 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19175 unsigned int nelt)
19176 {
19177 machine_mode vmode = GET_MODE (target);
19178 bool one_vector_p = rtx_equal_p (op0, op1);
19179 rtx mask;
19180
19181 /* The TBL instruction does not use a modulo index, so we must take care
19182 of that ourselves. */
19183 mask = aarch64_simd_gen_const_vector_dup (vmode,
19184 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19185 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19186
19187 /* For big-endian, we also need to reverse the index within the vector
19188 (but not which vector). */
19189 if (BYTES_BIG_ENDIAN)
19190 {
19191 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19192 if (!one_vector_p)
19193 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19194 sel = expand_simple_binop (vmode, XOR, sel, mask,
19195 NULL, 0, OPTAB_LIB_WIDEN);
19196 }
19197 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19198 }
19199
19200 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19201
19202 static void
19203 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19204 {
19205 emit_insn (gen_rtx_SET (target,
19206 gen_rtx_UNSPEC (GET_MODE (target),
19207 gen_rtvec (2, op0, op1), code)));
19208 }
19209
19210 /* Expand an SVE vec_perm with the given operands. */
19211
19212 void
19213 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19214 {
19215 machine_mode data_mode = GET_MODE (target);
19216 machine_mode sel_mode = GET_MODE (sel);
19217 /* Enforced by the pattern condition. */
19218 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19219
19220 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19221 size of the two value vectors, i.e. the upper bits of the indices
19222 are effectively ignored. SVE TBL instead produces 0 for any
19223 out-of-range indices, so we need to modulo all the vec_perm indices
19224 to ensure they are all in range. */
19225 rtx sel_reg = force_reg (sel_mode, sel);
19226
19227 /* Check if the sel only references the first values vector. */
19228 if (GET_CODE (sel) == CONST_VECTOR
19229 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19230 {
19231 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19232 return;
19233 }
19234
19235 /* Check if the two values vectors are the same. */
19236 if (rtx_equal_p (op0, op1))
19237 {
19238 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19239 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19240 NULL, 0, OPTAB_DIRECT);
19241 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19242 return;
19243 }
19244
19245 /* Run TBL on each value vector and combine the results. */
19246
19247 rtx res0 = gen_reg_rtx (data_mode);
19248 rtx res1 = gen_reg_rtx (data_mode);
19249 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19250 if (GET_CODE (sel) != CONST_VECTOR
19251 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19252 {
19253 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19254 2 * nunits - 1);
19255 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19256 NULL, 0, OPTAB_DIRECT);
19257 }
19258 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19259 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19260 NULL, 0, OPTAB_DIRECT);
19261 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19262 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19263 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19264 else
19265 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19266 }
19267
19268 /* Recognize patterns suitable for the TRN instructions. */
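/* For example, with V4SI operands the selector { 0, 4, 2, 6 } corresponds
to TRN1 and { 1, 5, 3, 7 } to TRN2, before any big-endian correction. */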
19269 static bool
19270 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19271 {
19272 HOST_WIDE_INT odd;
19273 poly_uint64 nelt = d->perm.length ();
19274 rtx out, in0, in1, x;
19275 machine_mode vmode = d->vmode;
19276
19277 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19278 return false;
19279
19280 /* Note that these are little-endian tests.
19281 We correct for big-endian later. */
19282 if (!d->perm[0].is_constant (&odd)
19283 || (odd != 0 && odd != 1)
19284 || !d->perm.series_p (0, 2, odd, 2)
19285 || !d->perm.series_p (1, 2, nelt + odd, 2))
19286 return false;
19287
19288 /* Success! */
19289 if (d->testing_p)
19290 return true;
19291
19292 in0 = d->op0;
19293 in1 = d->op1;
19294 /* We don't need a big-endian lane correction for SVE; see the comment
19295 at the head of aarch64-sve.md for details. */
19296 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19297 {
19298 x = in0, in0 = in1, in1 = x;
19299 odd = !odd;
19300 }
19301 out = d->target;
19302
19303 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19304 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19305 return true;
19306 }
19307
19308 /* Recognize patterns suitable for the UZP instructions. */
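/* For example, with V4SI operands the selector { 0, 2, 4, 6 } corresponds
to UZP1 and { 1, 3, 5, 7 } to UZP2, before any big-endian correction. */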
19309 static bool
19310 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19311 {
19312 HOST_WIDE_INT odd;
19313 rtx out, in0, in1, x;
19314 machine_mode vmode = d->vmode;
19315
19316 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19317 return false;
19318
19319 /* Note that these are little-endian tests.
19320 We correct for big-endian later. */
19321 if (!d->perm[0].is_constant (&odd)
19322 || (odd != 0 && odd != 1)
19323 || !d->perm.series_p (0, 1, odd, 2))
19324 return false;
19325
19326 /* Success! */
19327 if (d->testing_p)
19328 return true;
19329
19330 in0 = d->op0;
19331 in1 = d->op1;
19332 /* We don't need a big-endian lane correction for SVE; see the comment
19333 at the head of aarch64-sve.md for details. */
19334 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19335 {
19336 x = in0, in0 = in1, in1 = x;
19337 odd = !odd;
19338 }
19339 out = d->target;
19340
19341 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19342 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19343 return true;
19344 }
19345
19346 /* Recognize patterns suitable for the ZIP instructions. */
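/* For example, with V4SI operands the selector { 0, 4, 1, 5 } corresponds
to ZIP1 and { 2, 6, 3, 7 } to ZIP2, before any big-endian correction. */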
19347 static bool
19348 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19349 {
19350 unsigned int high;
19351 poly_uint64 nelt = d->perm.length ();
19352 rtx out, in0, in1, x;
19353 machine_mode vmode = d->vmode;
19354
19355 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19356 return false;
19357
19358 /* Note that these are little-endian tests.
19359 We correct for big-endian later. */
19360 poly_uint64 first = d->perm[0];
19361 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19362 || !d->perm.series_p (0, 2, first, 1)
19363 || !d->perm.series_p (1, 2, first + nelt, 1))
19364 return false;
19365 high = maybe_ne (first, 0U);
19366
19367 /* Success! */
19368 if (d->testing_p)
19369 return true;
19370
19371 in0 = d->op0;
19372 in1 = d->op1;
19373 /* We don't need a big-endian lane correction for SVE; see the comment
19374 at the head of aarch64-sve.md for details. */
19375 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19376 {
19377 x = in0, in0 = in1, in1 = x;
19378 high = !high;
19379 }
19380 out = d->target;
19381
19382 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19383 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19384 return true;
19385 }
19386
19387 /* Recognize patterns for the EXT insn. */
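/* For example, with V4SI operands the selector { 1, 2, 3, 4 } takes the
last three elements of the first vector and the first element of the
second, which is a single EXT with an element offset of 1. */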
19388
19389 static bool
19390 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19391 {
19392 HOST_WIDE_INT location;
19393 rtx offset;
19394
19395 /* The first element always refers to the first vector.
19396 Check if the extracted indices are increasing by one. */
19397 if (d->vec_flags == VEC_SVE_PRED
19398 || !d->perm[0].is_constant (&location)
19399 || !d->perm.series_p (0, 1, location, 1))
19400 return false;
19401
19402 /* Success! */
19403 if (d->testing_p)
19404 return true;
19405
19406 /* The case where (location == 0) is a no-op for both big- and little-endian,
19407 and is removed by the mid-end at optimization levels -O1 and higher.
19408
19409 We don't need a big-endian lane correction for SVE; see the comment
19410 at the head of aarch64-sve.md for details. */
19411 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19412 {
19413 /* After setup, we want the high elements of the first vector (stored
19414 at the LSB end of the register), and the low elements of the second
19415 vector (stored at the MSB end of the register). So swap. */
19416 std::swap (d->op0, d->op1);
19417 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19418 to_constant () is safe since this is restricted to Advanced SIMD
19419 vectors. */
19420 location = d->perm.length ().to_constant () - location;
19421 }
19422
19423 offset = GEN_INT (location);
19424 emit_set_insn (d->target,
19425 gen_rtx_UNSPEC (d->vmode,
19426 gen_rtvec (3, d->op0, d->op1, offset),
19427 UNSPEC_EXT));
19428 return true;
19429 }
19430
19431 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19432 within each 64-bit, 32-bit or 16-bit granule. */
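/* For example, the V8HI selector { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the
16-bit elements within each 64-bit granule and therefore maps to REV64. */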
19433
19434 static bool
19435 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19436 {
19437 HOST_WIDE_INT diff;
19438 unsigned int i, size, unspec;
19439 machine_mode pred_mode;
19440
19441 if (d->vec_flags == VEC_SVE_PRED
19442 || !d->one_vector_p
19443 || !d->perm[0].is_constant (&diff))
19444 return false;
19445
19446 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19447 if (size == 8)
19448 {
19449 unspec = UNSPEC_REV64;
19450 pred_mode = VNx2BImode;
19451 }
19452 else if (size == 4)
19453 {
19454 unspec = UNSPEC_REV32;
19455 pred_mode = VNx4BImode;
19456 }
19457 else if (size == 2)
19458 {
19459 unspec = UNSPEC_REV16;
19460 pred_mode = VNx8BImode;
19461 }
19462 else
19463 return false;
19464
19465 unsigned int step = diff + 1;
19466 for (i = 0; i < step; ++i)
19467 if (!d->perm.series_p (i, step, diff - i, step))
19468 return false;
19469
19470 /* Success! */
19471 if (d->testing_p)
19472 return true;
19473
19474 if (d->vec_flags == VEC_SVE_DATA)
19475 {
19476 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19477 rtx target = gen_reg_rtx (int_mode);
19478 if (BYTES_BIG_ENDIAN)
19479 /* The act of taking a subreg between INT_MODE and d->vmode
19480 is itself a reversing operation on big-endian targets;
19481 see the comment at the head of aarch64-sve.md for details.
19482 First reinterpret OP0 as INT_MODE without using a subreg
19483 and without changing the contents. */
19484 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19485 else
19486 {
19487 /* For SVE we use REV[BHW] unspecs derived from the element size
19488 of d->vmode and vector modes whose elements have SIZE bytes.
19489 This ensures that the vector modes match the predicate modes. */
19490 int unspec = aarch64_sve_rev_unspec (d->vmode);
19491 rtx pred = aarch64_ptrue_reg (pred_mode);
19492 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19493 gen_lowpart (int_mode, d->op0)));
19494 }
19495 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19496 return true;
19497 }
19498 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19499 emit_set_insn (d->target, src);
19500 return true;
19501 }
19502
19503 /* Recognize patterns for the REV insn, which reverses elements within
19504 a full vector. */
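/* Such a permutation is a single series that counts down from the last
element to element 0, e.g. { 7, 6, 5, 4, 3, 2, 1, 0 } for eight elements. */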
19505
19506 static bool
19507 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19508 {
19509 poly_uint64 nelt = d->perm.length ();
19510
19511 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19512 return false;
19513
19514 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19515 return false;
19516
19517 /* Success! */
19518 if (d->testing_p)
19519 return true;
19520
19521 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19522 emit_set_insn (d->target, src);
19523 return true;
19524 }
19525
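/* Recognize broadcast permutations, in which every element of the selector
refers to the same element of the first input, and implement them with
a DUP of that element. */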
19526 static bool
19527 aarch64_evpc_dup (struct expand_vec_perm_d *d)
19528 {
19529 rtx out = d->target;
19530 rtx in0;
19531 HOST_WIDE_INT elt;
19532 machine_mode vmode = d->vmode;
19533 rtx lane;
19534
19535 if (d->vec_flags == VEC_SVE_PRED
19536 || d->perm.encoding ().encoded_nelts () != 1
19537 || !d->perm[0].is_constant (&elt))
19538 return false;
19539
19540 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19541 return false;
19542
19543 /* Success! */
19544 if (d->testing_p)
19545 return true;
19546
19547 /* The generic preparation in aarch64_expand_vec_perm_const_1
19548 swaps the operand order and the permute indices if it finds
19549 d->perm[0] to be in the second operand. Thus, we can always
19550 use d->op0 and need not do any extra arithmetic to get the
19551 correct lane number. */
19552 in0 = d->op0;
19553 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19554
19555 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19556 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19557 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19558 return true;
19559 }
19560
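/* Implement a general constant permutation of Advanced SIMD vectors by
loading the selector into a register and using TBL. */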
19561 static bool
19562 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19563 {
19564 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19565 machine_mode vmode = d->vmode;
19566
19567 /* Make sure that the indices are constant. */
19568 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19569 for (unsigned int i = 0; i < encoded_nelts; ++i)
19570 if (!d->perm[i].is_constant ())
19571 return false;
19572
19573 if (d->testing_p)
19574 return true;
19575
19576 /* Generic code will try constant permutation twice: once with the
19577 original mode and again with the elements lowered to QImode.
19578 So wait and don't do the selector expansion ourselves. */
19579 if (vmode != V8QImode && vmode != V16QImode)
19580 return false;
19581
19582 /* to_constant is safe since this routine is specific to Advanced SIMD
19583 vectors. */
19584 unsigned int nelt = d->perm.length ().to_constant ();
19585 for (unsigned int i = 0; i < nelt; ++i)
19586 /* If big-endian and two vectors we end up with a weird mixed-endian
19587 mode on NEON. Reverse the index within each word but not the word
19588 itself. to_constant is safe because we checked is_constant above. */
19589 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19590 ? d->perm[i].to_constant () ^ (nelt - 1)
19591 : d->perm[i].to_constant ());
19592
19593 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19594 sel = force_reg (vmode, sel);
19595
19596 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19597 return true;
19598 }
19599
19600 /* Try to implement D using an SVE TBL instruction. */
19601
19602 static bool
19603 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19604 {
19605 unsigned HOST_WIDE_INT nelt;
19606
19607 /* Permuting two variable-length vectors could overflow the
19608 index range. */
19609 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19610 return false;
19611
19612 if (d->testing_p)
19613 return true;
19614
19615 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
19616 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19617 if (d->one_vector_p)
19618 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19619 else
19620 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19621 return true;
19622 }
19623
19624 /* Try to implement D using the SVE SEL instruction. */
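/* Such permutations are elementwise blends: element I of the result comes
from either element I of the first input or element I of the second,
with the choice repeating with the period of the selector encoding. */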
19625
19626 static bool
19627 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19628 {
19629 machine_mode vmode = d->vmode;
19630 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19631
19632 if (d->vec_flags != VEC_SVE_DATA
19633 || unit_size > 8)
19634 return false;
19635
19636 int n_patterns = d->perm.encoding ().npatterns ();
19637 poly_int64 vec_len = d->perm.length ();
19638
19639 for (int i = 0; i < n_patterns; ++i)
19640 if (!known_eq (d->perm[i], i)
19641 && !known_eq (d->perm[i], vec_len + i))
19642 return false;
19643
19644 for (int i = n_patterns; i < n_patterns * 2; i++)
19645 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19646 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19647 return false;
19648
19649 if (d->testing_p)
19650 return true;
19651
19652 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
19653
19654 /* Build a predicate that is true when op0 elements should be used. */
19655 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19656 for (int i = 0; i < n_patterns * 2; i++)
19657 {
19658 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19659 : CONST0_RTX (BImode);
19660 builder.quick_push (elem);
19661 }
19662
19663 rtx const_vec = builder.build ();
19664 rtx pred = force_reg (pred_mode, const_vec);
19665 /* TARGET = PRED ? OP0 : OP1. */
19666 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
19667 return true;
19668 }
19669
19670 static bool
19671 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19672 {
19673 /* The pattern matching functions above are written to look for a small
19674 number to begin the sequence (0, 1, N/2). If we begin with an index
19675 from the second operand, we can swap the operands. */
19676 poly_int64 nelt = d->perm.length ();
19677 if (known_ge (d->perm[0], nelt))
19678 {
19679 d->perm.rotate_inputs (1);
19680 std::swap (d->op0, d->op1);
19681 }
19682
19683 if ((d->vec_flags == VEC_ADVSIMD
19684 || d->vec_flags == VEC_SVE_DATA
19685 || d->vec_flags == VEC_SVE_PRED)
19686 && known_gt (nelt, 1))
19687 {
19688 if (aarch64_evpc_rev_local (d))
19689 return true;
19690 else if (aarch64_evpc_rev_global (d))
19691 return true;
19692 else if (aarch64_evpc_ext (d))
19693 return true;
19694 else if (aarch64_evpc_dup (d))
19695 return true;
19696 else if (aarch64_evpc_zip (d))
19697 return true;
19698 else if (aarch64_evpc_uzp (d))
19699 return true;
19700 else if (aarch64_evpc_trn (d))
19701 return true;
19702 else if (aarch64_evpc_sel (d))
19703 return true;
19704 if (d->vec_flags == VEC_SVE_DATA)
19705 return aarch64_evpc_sve_tbl (d);
19706 else if (d->vec_flags == VEC_ADVSIMD)
19707 return aarch64_evpc_tbl (d);
19708 }
19709 return false;
19710 }
19711
19712 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19713
19714 static bool
19715 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19716 rtx op1, const vec_perm_indices &sel)
19717 {
19718 struct expand_vec_perm_d d;
19719
19720 /* Check whether the mask can be applied to a single vector. */
19721 if (sel.ninputs () == 1
19722 || (op0 && rtx_equal_p (op0, op1)))
19723 d.one_vector_p = true;
19724 else if (sel.all_from_input_p (0))
19725 {
19726 d.one_vector_p = true;
19727 op1 = op0;
19728 }
19729 else if (sel.all_from_input_p (1))
19730 {
19731 d.one_vector_p = true;
19732 op0 = op1;
19733 }
19734 else
19735 d.one_vector_p = false;
19736
19737 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19738 sel.nelts_per_input ());
19739 d.vmode = vmode;
19740 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19741 d.target = target;
19742 d.op0 = op0;
19743 d.op1 = op1;
19744 d.testing_p = !target;
19745
19746 if (!d.testing_p)
19747 return aarch64_expand_vec_perm_const_1 (&d);
19748
19749 rtx_insn *last = get_last_insn ();
19750 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19751 gcc_assert (last == get_last_insn ());
19752
19753 return ret;
19754 }
19755
19756 /* Generate a byte permute mask for a register of mode MODE,
19757 which has NUNITS units. */
19758
19759 rtx
19760 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19761 {
19762 /* We have to reverse each vector because we don't have
19763 a permuted load that can reverse-load according to ABI rules. */
19764 rtx mask;
19765 rtvec v = rtvec_alloc (16);
19766 unsigned int i, j;
19767 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19768
19769 gcc_assert (BYTES_BIG_ENDIAN);
19770 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19771
19772 for (i = 0; i < nunits; i++)
19773 for (j = 0; j < usize; j++)
19774 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19775 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19776 return force_reg (V16QImode, mask);
19777 }
19778
19779 /* Expand an SVE integer comparison using the SVE equivalent of:
19780
19781 (set TARGET (CODE OP0 OP1)). */
19782
19783 void
19784 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19785 {
19786 machine_mode pred_mode = GET_MODE (target);
19787 machine_mode data_mode = GET_MODE (op0);
19788 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19789 op0, op1);
19790 if (!rtx_equal_p (target, res))
19791 emit_move_insn (target, res);
19792 }
19793
19794 /* Return the UNSPEC_COND_* code for comparison CODE. */
19795
19796 static unsigned int
19797 aarch64_unspec_cond_code (rtx_code code)
19798 {
19799 switch (code)
19800 {
19801 case NE:
19802 return UNSPEC_COND_FCMNE;
19803 case EQ:
19804 return UNSPEC_COND_FCMEQ;
19805 case LT:
19806 return UNSPEC_COND_FCMLT;
19807 case GT:
19808 return UNSPEC_COND_FCMGT;
19809 case LE:
19810 return UNSPEC_COND_FCMLE;
19811 case GE:
19812 return UNSPEC_COND_FCMGE;
19813 case UNORDERED:
19814 return UNSPEC_COND_FCMUO;
19815 default:
19816 gcc_unreachable ();
19817 }
19818 }
19819
19820 /* Emit:
19821
19822 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19823
19824 where <X> is the operation associated with comparison CODE.
19825 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19826
19827 static void
19828 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19829 bool known_ptrue_p, rtx op0, rtx op1)
19830 {
19831 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19832 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19833 gen_rtvec (4, pred, flag, op0, op1),
19834 aarch64_unspec_cond_code (code));
19835 emit_set_insn (target, unspec);
19836 }
19837
19838 /* Emit the SVE equivalent of:
19839
19840 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19841 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19842 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19843
19844 where <Xi> is the operation associated with comparison CODEi.
19845 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19846
19847 static void
19848 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19849 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19850 {
19851 machine_mode pred_mode = GET_MODE (pred);
19852 rtx tmp1 = gen_reg_rtx (pred_mode);
19853 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19854 rtx tmp2 = gen_reg_rtx (pred_mode);
19855 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19856 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19857 }
19858
19859 /* Emit the SVE equivalent of:
19860
19861 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19862 (set TARGET (not TMP))
19863
19864 where <X> is the operation associated with comparison CODE.
19865 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19866
19867 static void
19868 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19869 bool known_ptrue_p, rtx op0, rtx op1)
19870 {
19871 machine_mode pred_mode = GET_MODE (pred);
19872 rtx tmp = gen_reg_rtx (pred_mode);
19873 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19874 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19875 }
19876
19877 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19878
19879 (set TARGET (CODE OP0 OP1))
19880
19881 If CAN_INVERT_P is true, the caller can also handle inverted results;
19882 return true if the result is in fact inverted. */
19883
19884 bool
19885 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19886 rtx op0, rtx op1, bool can_invert_p)
19887 {
19888 machine_mode pred_mode = GET_MODE (target);
19889 machine_mode data_mode = GET_MODE (op0);
19890
19891 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19892 switch (code)
19893 {
19894 case UNORDERED:
19895 /* UNORDERED has no immediate form. */
19896 op1 = force_reg (data_mode, op1);
19897 /* fall through */
19898 case LT:
19899 case LE:
19900 case GT:
19901 case GE:
19902 case EQ:
19903 case NE:
19904 {
19905 /* There is native support for the comparison. */
19906 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19907 return false;
19908 }
19909
19910 case LTGT:
19911 /* This is a trapping operation (LT or GT). */
19912 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19913 return false;
19914
19915 case UNEQ:
19916 if (!flag_trapping_math)
19917 {
19918 /* This would trap for signaling NaNs. */
19919 op1 = force_reg (data_mode, op1);
19920 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19921 ptrue, true, op0, op1);
19922 return false;
19923 }
19924 /* fall through */
19925 case UNLT:
19926 case UNLE:
19927 case UNGT:
19928 case UNGE:
19929 if (flag_trapping_math)
19930 {
19931 /* Work out which elements are ordered. */
19932 rtx ordered = gen_reg_rtx (pred_mode);
19933 op1 = force_reg (data_mode, op1);
19934 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19935 ptrue, true, op0, op1);
19936
19937 /* Test the opposite condition for the ordered elements,
19938 then invert the result. */
19939 if (code == UNEQ)
19940 code = NE;
19941 else
19942 code = reverse_condition_maybe_unordered (code);
19943 if (can_invert_p)
19944 {
19945 aarch64_emit_sve_fp_cond (target, code,
19946 ordered, false, op0, op1);
19947 return true;
19948 }
19949 aarch64_emit_sve_invert_fp_cond (target, code,
19950 ordered, false, op0, op1);
19951 return false;
19952 }
19953 break;
19954
19955 case ORDERED:
19956 /* ORDERED has no immediate form. */
19957 op1 = force_reg (data_mode, op1);
19958 break;
19959
19960 default:
19961 gcc_unreachable ();
19962 }
19963
19964 /* There is native support for the inverse comparison. */
19965 code = reverse_condition_maybe_unordered (code);
19966 if (can_invert_p)
19967 {
19968 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19969 return true;
19970 }
19971 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19972 return false;
19973 }
19974
19975 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19976 of the data being selected and CMP_MODE is the mode of the values being
19977 compared. */
19978
19979 void
19980 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19981 rtx *ops)
19982 {
19983 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
19984 rtx pred = gen_reg_rtx (pred_mode);
19985 if (FLOAT_MODE_P (cmp_mode))
19986 {
19987 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19988 ops[4], ops[5], true))
19989 std::swap (ops[1], ops[2]);
19990 }
19991 else
19992 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19993
19994 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19995 ops[1] = force_reg (data_mode, ops[1]);
19996 /* The "false" value can only be zero if the "true" value is a constant. */
19997 if (register_operand (ops[1], data_mode)
19998 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19999 ops[2] = force_reg (data_mode, ops[2]);
20000
20001 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
20002 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
20003 }
20004
20005 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
20006 true. However, due to issues with register allocation it is preferable
20007 to avoid tying integer scalar and FP scalar modes. Executing integer
20008 operations in general registers is better than treating them as scalar
20009 vector operations. This reduces latency and avoids redundant int<->FP
20010 moves. So tie modes if they are either the same class, or vector modes
20011 with other vector modes, vector structs or any scalar mode. */
20012
20013 static bool
20014 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
20015 {
20016 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
20017 return true;
20018
20019 /* We specifically want to allow elements of "structure" modes to
20020 be tieable to the structure. This more general condition allows
20021 other rarer situations too. The reason we don't extend this to
20022 predicate modes is that there are no predicate structure modes
20023 nor any specific instructions for extracting part of a predicate
20024 register. */
20025 if (aarch64_vector_data_mode_p (mode1)
20026 && aarch64_vector_data_mode_p (mode2))
20027 return true;
20028
20029 /* Also allow any scalar modes with vectors. */
20030 if (aarch64_vector_mode_supported_p (mode1)
20031 || aarch64_vector_mode_supported_p (mode2))
20032 return true;
20033
20034 return false;
20035 }
20036
20037 /* Return a new RTX holding the result of moving POINTER forward by
20038 AMOUNT bytes. */
20039
20040 static rtx
20041 aarch64_move_pointer (rtx pointer, poly_int64 amount)
20042 {
20043 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
20044
20045 return adjust_automodify_address (pointer, GET_MODE (pointer),
20046 next, amount);
20047 }
20048
20049 /* Return a new RTX holding the result of moving POINTER forward by the
20050 size of the mode it points to. */
20051
20052 static rtx
20053 aarch64_progress_pointer (rtx pointer)
20054 {
20055 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
20056 }
20057
20058 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
20059 MODE bytes. */
20060
20061 static void
20062 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
20063 machine_mode mode)
20064 {
20065 rtx reg = gen_reg_rtx (mode);
20066
20067 /* "Cast" the pointers to the correct mode. */
20068 *src = adjust_address (*src, mode, 0);
20069 *dst = adjust_address (*dst, mode, 0);
20070 /* Emit the memcpy. */
20071 emit_move_insn (reg, *src);
20072 emit_move_insn (*dst, reg);
20073 /* Move the pointers forward. */
20074 *src = aarch64_progress_pointer (*src);
20075 *dst = aarch64_progress_pointer (*dst);
20076 }
20077
20078 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
20079 we succeed, otherwise return false. */
20080
20081 bool
20082 aarch64_expand_cpymem (rtx *operands)
20083 {
20084 int n, mode_bits;
20085 rtx dst = operands[0];
20086 rtx src = operands[1];
20087 rtx base;
20088 machine_mode cur_mode = BLKmode, next_mode;
20089 bool speed_p = !optimize_function_for_size_p (cfun);
20090
20091 /* When optimizing for size, give a better estimate of the length of a
20092 memcpy call, but use the default otherwise. Moves larger than 8 bytes
20093 will always require an even number of instructions to do now, and each
20094 operation requires both a load and a store, so divide the max number by 2. */
20095 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
20096
20097 /* We can't do anything smart if the amount to copy is not constant. */
20098 if (!CONST_INT_P (operands[2]))
20099 return false;
20100
20101 n = INTVAL (operands[2]);
20102
20103 /* Try to keep the number of instructions low. For all cases we will do at
20104 most two moves for the residual amount, since we'll always overlap the
20105 remainder. */
20106 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
20107 return false;
20108
20109 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20110 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
20111
20112 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
20113 src = adjust_automodify_address (src, VOIDmode, base, 0);
20114
20115 /* Convert n to bits to make the rest of the code simpler. */
20116 n = n * BITS_PER_UNIT;
20117
20118 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
20119 larger than TImode, but we should not use them for loads/stores here. */
20120 const int copy_limit = GET_MODE_BITSIZE (TImode);
20121
20122 while (n > 0)
20123 {
20124 /* Find the largest mode in which to do the copy without over-reading
20125 or over-writing. */
20126 opt_scalar_int_mode mode_iter;
20127 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
20128 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
20129 cur_mode = mode_iter.require ();
20130
20131 gcc_assert (cur_mode != BLKmode);
20132
20133 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
20134 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
20135
20136 n -= mode_bits;
20137
20138 /* Do certain trailing copies as overlapping if it's going to be
20139 cheaper, i.e. fewer instructions to do so. For instance, for a 15-byte
20140 copy it's more efficient to do two overlapping 8-byte copies than
20141 8 + 6 + 1. */
20142 if (n > 0 && n <= 8 * BITS_PER_UNIT)
20143 {
20144 next_mode = smallest_mode_for_size (n, MODE_INT);
20145 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
20146 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
20147 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
20148 n = n_bits;
20149 }
20150 }
20151
20152 return true;
20153 }
20154
20155 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20156 SImode stores. Handle the case when the constant has identical
20157 bottom and top halves. This is beneficial when the two stores can be
20158 merged into an STP and we avoid synthesising potentially expensive
20159 immediates twice. Return true if such a split is possible. */
20160
20161 bool
20162 aarch64_split_dimode_const_store (rtx dst, rtx src)
20163 {
20164 rtx lo = gen_lowpart (SImode, src);
20165 rtx hi = gen_highpart_mode (SImode, DImode, src);
20166
20167 bool size_p = optimize_function_for_size_p (cfun);
20168
20169 if (!rtx_equal_p (lo, hi))
20170 return false;
20171
20172 unsigned int orig_cost
20173 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20174 unsigned int lo_cost
20175 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20176
20177 /* We want to transform:
20178 MOV x1, 49370
20179 MOVK x1, 0x140, lsl 16
20180 MOVK x1, 0xc0da, lsl 32
20181 MOVK x1, 0x140, lsl 48
20182 STR x1, [x0]
20183 into:
20184 MOV w1, 49370
20185 MOVK w1, 0x140, lsl 16
20186 STP w1, w1, [x0]
20187 So we want to perform this only when we save two instructions
20188 or more. When optimizing for size, however, accept any code size
20189 savings we can. */
20190 if (size_p && orig_cost <= lo_cost)
20191 return false;
20192
20193 if (!size_p
20194 && (orig_cost <= lo_cost + 1))
20195 return false;
20196
20197 rtx mem_lo = adjust_address (dst, SImode, 0);
20198 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20199 return false;
20200
20201 rtx tmp_reg = gen_reg_rtx (SImode);
20202 aarch64_expand_mov_immediate (tmp_reg, lo);
20203 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20204 /* Don't emit an explicit store pair as this may not always be profitable.
20205 Let the sched-fusion logic decide whether to merge them. */
20206 emit_move_insn (mem_lo, tmp_reg);
20207 emit_move_insn (mem_hi, tmp_reg);
20208
20209 return true;
20210 }
20211
20212 /* Generate RTL for a conditional branch with rtx comparison CODE in
20213 mode CC_MODE. The destination of the unlikely conditional branch
20214 is LABEL_REF. */
20215
20216 void
20217 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20218 rtx label_ref)
20219 {
20220 rtx x;
20221 x = gen_rtx_fmt_ee (code, VOIDmode,
20222 gen_rtx_REG (cc_mode, CC_REGNUM),
20223 const0_rtx);
20224
20225 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20226 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20227 pc_rtx);
20228 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20229 }
20230
20231 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20232
20233 OP1 represents the TImode destination operand 1
20234 OP2 represents the TImode destination operand 2
20235 LOW_DEST represents the low half (DImode) of TImode operand 0
20236 LOW_IN1 represents the low half (DImode) of TImode operand 1
20237 LOW_IN2 represents the low half (DImode) of TImode operand 2
20238 HIGH_DEST represents the high half (DImode) of TImode operand 0
20239 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20240 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20241
20242 void
20243 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20244 rtx *low_in1, rtx *low_in2,
20245 rtx *high_dest, rtx *high_in1,
20246 rtx *high_in2)
20247 {
20248 *low_dest = gen_reg_rtx (DImode);
20249 *low_in1 = gen_lowpart (DImode, op1);
20250 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20251 subreg_lowpart_offset (DImode, TImode));
20252 *high_dest = gen_reg_rtx (DImode);
20253 *high_in1 = gen_highpart (DImode, op1);
20254 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20255 subreg_highpart_offset (DImode, TImode));
20256 }
20257
20258 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20259
20260 This function differs from 'aarch64_addti_scratch_regs' in that
20261 OP1 can be an immediate constant (zero). We must call
20262 subreg_highpart_offset with DImode and TImode arguments, otherwise
20263 VOIDmode will be used for the const_int which generates an internal
20264 error from subreg_size_highpart_offset which does not expect a size of zero.
20265
20266 OP1 represents the TImode destination operand 1
20267 OP2 represents the TImode destination operand 2
20268 LOW_DEST represents the low half (DImode) of TImode operand 0
20269 LOW_IN1 represents the low half (DImode) of TImode operand 1
20270 LOW_IN2 represents the low half (DImode) of TImode operand 2
20271 HIGH_DEST represents the high half (DImode) of TImode operand 0
20272 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20273 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20274
20275
20276 void
20277 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20278 rtx *low_in1, rtx *low_in2,
20279 rtx *high_dest, rtx *high_in1,
20280 rtx *high_in2)
20281 {
20282 *low_dest = gen_reg_rtx (DImode);
20283 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20284 subreg_lowpart_offset (DImode, TImode));
20285
20286 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20287 subreg_lowpart_offset (DImode, TImode));
20288 *high_dest = gen_reg_rtx (DImode);
20289
20290 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20291 subreg_highpart_offset (DImode, TImode));
20292 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20293 subreg_highpart_offset (DImode, TImode));
20294 }
20295
20296 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20297
20298 OP0 represents the TImode destination operand 0
20299 LOW_DEST represents the low half (DImode) of TImode operand 0
20300 LOW_IN1 represents the low half (DImode) of TImode operand 1
20301 LOW_IN2 represents the low half (DImode) of TImode operand 2
20302 HIGH_DEST represents the high half (DImode) of TImode operand 0
20303 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20304 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20305 UNSIGNED_P is true if the operation is being performed on unsigned
20306 values. */
20307 void
20308 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20309 rtx low_in2, rtx high_dest, rtx high_in1,
20310 rtx high_in2, bool unsigned_p)
20311 {
20312 if (low_in2 == const0_rtx)
20313 {
20314 low_dest = low_in1;
20315 high_in2 = force_reg (DImode, high_in2);
20316 if (unsigned_p)
20317 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20318 else
20319 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20320 }
20321 else
20322 {
20323 if (aarch64_plus_immediate (low_in2, DImode))
20324 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20325 GEN_INT (-INTVAL (low_in2))));
20326 else
20327 {
20328 low_in2 = force_reg (DImode, low_in2);
20329 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20330 }
20331 high_in2 = force_reg (DImode, high_in2);
20332
20333 if (unsigned_p)
20334 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20335 else
20336 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20337 }
20338
20339 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20340 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20341
20342 }
20343
20344 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20345
20346 static unsigned HOST_WIDE_INT
20347 aarch64_asan_shadow_offset (void)
20348 {
20349 if (TARGET_ILP32)
20350 return (HOST_WIDE_INT_1 << 29);
20351 else
20352 return (HOST_WIDE_INT_1 << 36);
20353 }
20354
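/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
conditional-compare sequence, comparing TREEOP0 with TREEOP1 using CODE.
Store the preparation insns in *PREP_SEQ and the comparison itself in
*GEN_SEQ, and return the comparison rtx, or NULL_RTX if the comparison
cannot be handled. */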
20355 static rtx
20356 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20357 int code, tree treeop0, tree treeop1)
20358 {
20359 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20360 rtx op0, op1;
20361 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20362 insn_code icode;
20363 struct expand_operand ops[4];
20364
20365 start_sequence ();
20366 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20367
20368 op_mode = GET_MODE (op0);
20369 if (op_mode == VOIDmode)
20370 op_mode = GET_MODE (op1);
20371
20372 switch (op_mode)
20373 {
20374 case E_QImode:
20375 case E_HImode:
20376 case E_SImode:
20377 cmp_mode = SImode;
20378 icode = CODE_FOR_cmpsi;
20379 break;
20380
20381 case E_DImode:
20382 cmp_mode = DImode;
20383 icode = CODE_FOR_cmpdi;
20384 break;
20385
20386 case E_SFmode:
20387 cmp_mode = SFmode;
20388 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20389 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20390 break;
20391
20392 case E_DFmode:
20393 cmp_mode = DFmode;
20394 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20395 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20396 break;
20397
20398 default:
20399 end_sequence ();
20400 return NULL_RTX;
20401 }
20402
20403 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20404 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20405 if (!op0 || !op1)
20406 {
20407 end_sequence ();
20408 return NULL_RTX;
20409 }
20410 *prep_seq = get_insns ();
20411 end_sequence ();
20412
20413 create_fixed_operand (&ops[0], op0);
20414 create_fixed_operand (&ops[1], op1);
20415
20416 start_sequence ();
20417 if (!maybe_expand_insn (icode, 2, ops))
20418 {
20419 end_sequence ();
20420 return NULL_RTX;
20421 }
20422 *gen_seq = get_insns ();
20423 end_sequence ();
20424
20425 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20426 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20427 }
20428
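/* Implement TARGET_GEN_CCMP_NEXT. Expand a conditional compare that
combines a previous comparison PREV with the comparison of TREEOP0 and
TREEOP1 using CMP_CODE; BIT_CODE says whether the two comparisons are
combined with AND or IOR. Add the new insns to *PREP_SEQ and *GEN_SEQ
and return the new comparison rtx, or NULL_RTX on failure. */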
20429 static rtx
20430 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20431 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20432 {
20433 rtx op0, op1, target;
20434 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20435 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20436 insn_code icode;
20437 struct expand_operand ops[6];
20438 int aarch64_cond;
20439
20440 push_to_sequence (*prep_seq);
20441 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20442
20443 op_mode = GET_MODE (op0);
20444 if (op_mode == VOIDmode)
20445 op_mode = GET_MODE (op1);
20446
20447 switch (op_mode)
20448 {
20449 case E_QImode:
20450 case E_HImode:
20451 case E_SImode:
20452 cmp_mode = SImode;
20453 break;
20454
20455 case E_DImode:
20456 cmp_mode = DImode;
20457 break;
20458
20459 case E_SFmode:
20460 cmp_mode = SFmode;
20461 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20462 break;
20463
20464 case E_DFmode:
20465 cmp_mode = DFmode;
20466 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20467 break;
20468
20469 default:
20470 end_sequence ();
20471 return NULL_RTX;
20472 }
20473
20474 icode = code_for_ccmp (cc_mode, cmp_mode);
20475
20476 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20477 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20478 if (!op0 || !op1)
20479 {
20480 end_sequence ();
20481 return NULL_RTX;
20482 }
20483 *prep_seq = get_insns ();
20484 end_sequence ();
20485
20486 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20487 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20488
20489 if (bit_code != AND)
20490 {
20491 /* Treat the ccmp patterns as canonical and use them where possible,
20492 but fall back to ccmp_rev patterns if there's no other option. */
20493 rtx_code prev_code = GET_CODE (prev);
20494 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
20495 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
20496 && !(prev_code == EQ
20497 || prev_code == NE
20498 || prev_code == ORDERED
20499 || prev_code == UNORDERED))
20500 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
20501 else
20502 {
20503 rtx_code code = reverse_condition (prev_code);
20504 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
20505 }
20506 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20507 }
20508
20509 create_fixed_operand (&ops[0], XEXP (prev, 0));
20510 create_fixed_operand (&ops[1], target);
20511 create_fixed_operand (&ops[2], op0);
20512 create_fixed_operand (&ops[3], op1);
20513 create_fixed_operand (&ops[4], prev);
20514 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20515
20516 push_to_sequence (*gen_seq);
20517 if (!maybe_expand_insn (icode, 6, ops))
20518 {
20519 end_sequence ();
20520 return NULL_RTX;
20521 }
20522
20523 *gen_seq = get_insns ();
20524 end_sequence ();
20525
20526 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
20527 }
20528
20529 #undef TARGET_GEN_CCMP_FIRST
20530 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20531
20532 #undef TARGET_GEN_CCMP_NEXT
20533 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20534
20535 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20536 instruction fusion of some sort. */
20537
20538 static bool
20539 aarch64_macro_fusion_p (void)
20540 {
20541 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20542 }
20543
20544
20545 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20546 should be kept together during scheduling. */
20547
20548 static bool
20549 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20550 {
20551 rtx set_dest;
20552 rtx prev_set = single_set (prev);
20553 rtx curr_set = single_set (curr);
20554 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
20555 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20556
20557 if (!aarch64_macro_fusion_p ())
20558 return false;
20559
20560 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20561 {
20562 /* We are trying to match:
20563 prev (mov) == (set (reg r0) (const_int imm16))
20564 curr (movk) == (set (zero_extract (reg r0)
20565 (const_int 16)
20566 (const_int 16))
20567 (const_int imm16_1)) */
20568
20569 set_dest = SET_DEST (curr_set);
20570
20571 if (GET_CODE (set_dest) == ZERO_EXTRACT
20572 && CONST_INT_P (SET_SRC (curr_set))
20573 && CONST_INT_P (SET_SRC (prev_set))
20574 && CONST_INT_P (XEXP (set_dest, 2))
20575 && INTVAL (XEXP (set_dest, 2)) == 16
20576 && REG_P (XEXP (set_dest, 0))
20577 && REG_P (SET_DEST (prev_set))
20578 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20579 {
20580 return true;
20581 }
20582 }
20583
20584 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20585 {
20586
20587 /* We're trying to match:
20588 prev (adrp) == (set (reg r1)
20589 (high (symbol_ref ("SYM"))))
20590 curr (add) == (set (reg r0)
20591 (lo_sum (reg r1)
20592 (symbol_ref ("SYM"))))
20593 Note that r0 need not necessarily be the same as r1, especially
20594 during pre-regalloc scheduling. */
20595
20596 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20597 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20598 {
20599 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20600 && REG_P (XEXP (SET_SRC (curr_set), 0))
20601 && REGNO (XEXP (SET_SRC (curr_set), 0))
20602 == REGNO (SET_DEST (prev_set))
20603 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20604 XEXP (SET_SRC (curr_set), 1)))
20605 return true;
20606 }
20607 }
20608
20609 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20610 {
20611
20612 /* We're trying to match:
20613 prev (movk) == (set (zero_extract (reg r0)
20614 (const_int 16)
20615 (const_int 32))
20616 (const_int imm16_1))
20617 curr (movk) == (set (zero_extract (reg r0)
20618 (const_int 16)
20619 (const_int 48))
20620 (const_int imm16_2)) */
20621
20622 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20623 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20624 && REG_P (XEXP (SET_DEST (prev_set), 0))
20625 && REG_P (XEXP (SET_DEST (curr_set), 0))
20626 && REGNO (XEXP (SET_DEST (prev_set), 0))
20627 == REGNO (XEXP (SET_DEST (curr_set), 0))
20628 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20629 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20630 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20631 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20632 && CONST_INT_P (SET_SRC (prev_set))
20633 && CONST_INT_P (SET_SRC (curr_set)))
20634 return true;
20635
20636 }
20637 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20638 {
20639 /* We're trying to match:
20640 prev (adrp) == (set (reg r0)
20641 (high (symbol_ref ("SYM"))))
20642 curr (ldr) == (set (reg r1)
20643 (mem (lo_sum (reg r0)
20644 (symbol_ref ("SYM")))))
20645 or
20646 curr (ldr) == (set (reg r1)
20647 (zero_extend (mem
20648 (lo_sum (reg r0)
20649 (symbol_ref ("SYM")))))) */
20650 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20651 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20652 {
20653 rtx curr_src = SET_SRC (curr_set);
20654
20655 if (GET_CODE (curr_src) == ZERO_EXTEND)
20656 curr_src = XEXP (curr_src, 0);
20657
20658 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20659 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20660 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20661 == REGNO (SET_DEST (prev_set))
20662 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20663 XEXP (SET_SRC (prev_set), 0)))
20664 return true;
20665 }
20666 }
20667
20668 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20669 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20670 && prev_set && curr_set && any_condjump_p (curr)
20671 && GET_CODE (SET_SRC (prev_set)) == COMPARE
20672 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
20673 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
20674 return true;
20675
20676 /* Fuse flag-setting ALU instructions and conditional branch. */
20677 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20678 && any_condjump_p (curr))
20679 {
20680 unsigned int condreg1, condreg2;
20681 rtx cc_reg_1;
20682 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20683 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20684
20685 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20686 && prev
20687 && modified_in_p (cc_reg_1, prev))
20688 {
20689 enum attr_type prev_type = get_attr_type (prev);
20690
20691 /* FIXME: this misses some instructions that ThunderX considers simple
20692 arithmetic instructions. Simple shifts are missed here. */
20693 if (prev_type == TYPE_ALUS_SREG
20694 || prev_type == TYPE_ALUS_IMM
20695 || prev_type == TYPE_LOGICS_REG
20696 || prev_type == TYPE_LOGICS_IMM)
20697 return true;
20698 }
20699 }
20700
20701 /* Fuse ALU instructions and CBZ/CBNZ. */
20702 if (prev_set
20703 && curr_set
20704 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
20705 && any_condjump_p (curr))
20706 {
20707 /* We're trying to match:
20708 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20709 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20710 (const_int 0))
20711 (label_ref ("SYM"))
20712 (pc)) */
20713 if (SET_DEST (curr_set) == (pc_rtx)
20714 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20715 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20716 && REG_P (SET_DEST (prev_set))
20717 && REGNO (SET_DEST (prev_set))
20718 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20719 {
20720 /* Fuse ALU operations followed by conditional branch instruction. */
20721 switch (get_attr_type (prev))
20722 {
20723 case TYPE_ALU_IMM:
20724 case TYPE_ALU_SREG:
20725 case TYPE_ADC_REG:
20726 case TYPE_ADC_IMM:
20727 case TYPE_ADCS_REG:
20728 case TYPE_ADCS_IMM:
20729 case TYPE_LOGIC_REG:
20730 case TYPE_LOGIC_IMM:
20731 case TYPE_CSEL:
20732 case TYPE_ADR:
20733 case TYPE_MOV_IMM:
20734 case TYPE_SHIFT_REG:
20735 case TYPE_SHIFT_IMM:
20736 case TYPE_BFM:
20737 case TYPE_RBIT:
20738 case TYPE_REV:
20739 case TYPE_EXTEND:
20740 return true;
20741
20742 default:;
20743 }
20744 }
20745 }
20746
20747 return false;
20748 }
20749
20750 /* Return true iff the instruction fusion described by OP is enabled. */
20751
20752 bool
20753 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20754 {
20755 return (aarch64_tune_params.fusible_ops & op) != 0;
20756 }
20757
20758 /* If the address of MEM is in the form of [base+offset], extract the two
20759 parts into BASE and OFFSET and return true; otherwise return false
20760 after clearing BASE and OFFSET. */
20761
20762 bool
20763 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20764 {
20765 rtx addr;
20766
20767 gcc_assert (MEM_P (mem));
20768
20769 addr = XEXP (mem, 0);
20770
20771 if (REG_P (addr))
20772 {
20773 *base = addr;
20774 *offset = const0_rtx;
20775 return true;
20776 }
20777
20778 if (GET_CODE (addr) == PLUS
20779 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20780 {
20781 *base = XEXP (addr, 0);
20782 *offset = XEXP (addr, 1);
20783 return true;
20784 }
20785
20786 *base = NULL_RTX;
20787 *offset = NULL_RTX;
20788
20789 return false;
20790 }
20791
20792 /* Types for scheduling fusion. */
20793 enum sched_fusion_type
20794 {
20795 SCHED_FUSION_NONE = 0,
20796 SCHED_FUSION_LD_SIGN_EXTEND,
20797 SCHED_FUSION_LD_ZERO_EXTEND,
20798 SCHED_FUSION_LD,
20799 SCHED_FUSION_ST,
20800 SCHED_FUSION_NUM
20801 };
20802
20803 /* If INSN is a load or store whose address is in the form of [base+offset],
20804 extract the two parts into BASE and OFFSET. Return the scheduling
20805 fusion type of this INSN. */
20806
20807 static enum sched_fusion_type
20808 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20809 {
20810 rtx x, dest, src;
20811 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20812
20813 gcc_assert (INSN_P (insn));
20814 x = PATTERN (insn);
20815 if (GET_CODE (x) != SET)
20816 return SCHED_FUSION_NONE;
20817
20818 src = SET_SRC (x);
20819 dest = SET_DEST (x);
20820
20821 machine_mode dest_mode = GET_MODE (dest);
20822
20823 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20824 return SCHED_FUSION_NONE;
20825
20826 if (GET_CODE (src) == SIGN_EXTEND)
20827 {
20828 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20829 src = XEXP (src, 0);
20830 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20831 return SCHED_FUSION_NONE;
20832 }
20833 else if (GET_CODE (src) == ZERO_EXTEND)
20834 {
20835 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20836 src = XEXP (src, 0);
20837 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20838 return SCHED_FUSION_NONE;
20839 }
20840
20841 if (GET_CODE (src) == MEM && REG_P (dest))
20842 extract_base_offset_in_addr (src, base, offset);
20843 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20844 {
20845 fusion = SCHED_FUSION_ST;
20846 extract_base_offset_in_addr (dest, base, offset);
20847 }
20848 else
20849 return SCHED_FUSION_NONE;
20850
20851 if (*base == NULL_RTX || *offset == NULL_RTX)
20852 fusion = SCHED_FUSION_NONE;
20853
20854 return fusion;
20855 }
20856
20857 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20858
20859 Currently we only support fusing ldr or str instructions, so FUSION_PRI
20860 and PRI are only calculated for these instructions.  For other instructions,
20861 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
20862 types of instruction fusion can be added by returning different priorities.
20863
20864 It's important that irrelevant instructions get the largest FUSION_PRI. */
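/* As an illustrative sketch: two loads from the same base register get
the same FUSION_PRI (it depends only on the fusion type and the base
register number), while their PRI values differ according to the
immediate offsets, so the access with the smaller offset is scheduled
first.  */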
20865
20866 static void
20867 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20868 int *fusion_pri, int *pri)
20869 {
20870 int tmp, off_val;
20871 rtx base, offset;
20872 enum sched_fusion_type fusion;
20873
20874 gcc_assert (INSN_P (insn));
20875
20876 tmp = max_pri - 1;
20877 fusion = fusion_load_store (insn, &base, &offset);
20878 if (fusion == SCHED_FUSION_NONE)
20879 {
20880 *pri = tmp;
20881 *fusion_pri = tmp;
20882 return;
20883 }
20884
20885 /* Set FUSION_PRI according to fusion type and base register. */
20886 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20887
20888 /* Calculate PRI. */
20889 tmp /= 2;
20890
20891 /* INSN with smaller offset goes first. */
20892 off_val = (int)(INTVAL (offset));
20893 if (off_val >= 0)
20894 tmp -= (off_val & 0xfffff);
20895 else
20896 tmp += ((- off_val) & 0xfffff);
20897
20898 *pri = tmp;
20899 return;
20900 }
20901
20902 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20903 Adjust priority of sha1h instructions so they are scheduled before
20904 other SHA1 instructions. */
20905
20906 static int
20907 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20908 {
20909 rtx x = PATTERN (insn);
20910
20911 if (GET_CODE (x) == SET)
20912 {
20913 x = SET_SRC (x);
20914
20915 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20916 return priority + 10;
20917 }
20918
20919 return priority;
20920 }
20921
20922 /* Given OPERANDS of consecutive load/store, check if we can merge
20923 them into ldp/stp. LOAD is true if they are load instructions.
20924 MODE is the mode of memory operands. */
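/* For instance (illustrative operands):

ldr x0, [x2]
ldr x1, [x2, 8]

pass these checks and can be merged into:

ldp x0, x1, [x2]  */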
20925
20926 bool
20927 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20928 machine_mode mode)
20929 {
20930 HOST_WIDE_INT offval_1, offval_2, msize;
20931 enum reg_class rclass_1, rclass_2;
20932 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20933
20934 if (load)
20935 {
20936 mem_1 = operands[1];
20937 mem_2 = operands[3];
20938 reg_1 = operands[0];
20939 reg_2 = operands[2];
20940 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20941 if (REGNO (reg_1) == REGNO (reg_2))
20942 return false;
20943 }
20944 else
20945 {
20946 mem_1 = operands[0];
20947 mem_2 = operands[2];
20948 reg_1 = operands[1];
20949 reg_2 = operands[3];
20950 }
20951
20952 /* The mems cannot be volatile. */
20953 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20954 return false;
20955
20956 /* If we have SImode and slow unaligned ldp,
20957 check that the alignment is at least 8 bytes.  */
20958 if (mode == SImode
20959 && (aarch64_tune_params.extra_tuning_flags
20960 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20961 && !optimize_size
20962 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20963 return false;
20964
20965 /* Check if the addresses are in the form of [base+offset]. */
20966 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20967 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20968 return false;
20969 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20970 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20971 return false;
20972
20973 /* Check if the bases are the same.  */
20974 if (!rtx_equal_p (base_1, base_2))
20975 return false;
20976
20977 /* The operands must be of the same size. */
20978 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20979 GET_MODE_SIZE (GET_MODE (mem_2))));
20980
20981 offval_1 = INTVAL (offset_1);
20982 offval_2 = INTVAL (offset_2);
20983 /* We should only be trying this for fixed-sized modes. There is no
20984 SVE LDP/STP instruction. */
20985 msize = GET_MODE_SIZE (mode).to_constant ();
20986 /* Check if the offsets are consecutive. */
20987 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20988 return false;
20989
20990 /* Check if the addresses are clobbered by load. */
20991 if (load)
20992 {
20993 if (reg_mentioned_p (reg_1, mem_1))
20994 return false;
20995
20996 /* In increasing order, the last load can clobber the address. */
20997 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20998 return false;
20999 }
21000
21001 /* One of the memory accesses must be a mempair operand.
21002 If it is not the first one, they need to be swapped by the
21003 peephole. */
21004 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
21005 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
21006 return false;
21007
21008 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
21009 rclass_1 = FP_REGS;
21010 else
21011 rclass_1 = GENERAL_REGS;
21012
21013 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
21014 rclass_2 = FP_REGS;
21015 else
21016 rclass_2 = GENERAL_REGS;
21017
21018 /* Check if the registers are of the same class.  */
21019 if (rclass_1 != rclass_2)
21020 return false;
21021
21022 return true;
21023 }
21024
21025 /* Given OPERANDS of consecutive load/store that can be merged,
21026 swap them if they are not in ascending order. */
21027 void
21028 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
21029 {
21030 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
21031 HOST_WIDE_INT offval_1, offval_2;
21032
21033 if (load)
21034 {
21035 mem_1 = operands[1];
21036 mem_2 = operands[3];
21037 }
21038 else
21039 {
21040 mem_1 = operands[0];
21041 mem_2 = operands[2];
21042 }
21043
21044 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21045 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21046
21047 offval_1 = INTVAL (offset_1);
21048 offval_2 = INTVAL (offset_2);
21049
21050 if (offval_1 > offval_2)
21051 {
21052 /* Irrespective of whether this is a load or a store,
21053 we do the same swap. */
21054 std::swap (operands[0], operands[2]);
21055 std::swap (operands[1], operands[3]);
21056 }
21057 }
21058
21059 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
21060 comparison between the two. */
21061 int
21062 aarch64_host_wide_int_compare (const void *x, const void *y)
21063 {
21064 return wi::cmps (* ((const HOST_WIDE_INT *) x),
21065 * ((const HOST_WIDE_INT *) y));
21066 }
21067
21068 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
21069 other pointing to a REG rtx containing an offset, compare the offsets
21070 of the two pairs.
21071
21072 Return:
21073
21074 1 iff offset (X) > offset (Y)
21075 0 iff offset (X) == offset (Y)
21076 -1 iff offset (X) < offset (Y) */
21077 int
21078 aarch64_ldrstr_offset_compare (const void *x, const void *y)
21079 {
21080 const rtx * operands_1 = (const rtx *) x;
21081 const rtx * operands_2 = (const rtx *) y;
21082 rtx mem_1, mem_2, base, offset_1, offset_2;
21083
21084 if (MEM_P (operands_1[0]))
21085 mem_1 = operands_1[0];
21086 else
21087 mem_1 = operands_1[1];
21088
21089 if (MEM_P (operands_2[0]))
21090 mem_2 = operands_2[0];
21091 else
21092 mem_2 = operands_2[1];
21093
21094 /* Extract the offsets. */
21095 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21096 extract_base_offset_in_addr (mem_2, &base, &offset_2);
21097
21098 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
21099
21100 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
21101 }
21102
21103 /* Given OPERANDS of consecutive load/store, check if we can merge
21104 them into ldp/stp by adjusting the offset. LOAD is true if they
21105 are load instructions. MODE is the mode of memory operands.
21106
21107 Given the following consecutive stores:
21108
21109 str w1, [xb, 0x100]
21110 str w1, [xb, 0x104]
21111 str w1, [xb, 0x108]
21112 str w1, [xb, 0x10c]
21113
21114 Though the offsets are out of the range supported by stp, we can
21115 still pair them after adjusting the offset, like:
21116
21117 add scratch, xb, 0x100
21118 stp w1, w1, [scratch]
21119 stp w1, w1, [scratch, 0x8]
21120
21121 The peephole patterns detecting this opportunity should guarantee
21122 the scratch register is available.  */
21123
21124 bool
21125 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
21126 scalar_mode mode)
21127 {
21128 const int num_insns = 4;
21129 enum reg_class rclass;
21130 HOST_WIDE_INT offvals[num_insns], msize;
21131 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
21132
21133 if (load)
21134 {
21135 for (int i = 0; i < num_insns; i++)
21136 {
21137 reg[i] = operands[2 * i];
21138 mem[i] = operands[2 * i + 1];
21139
21140 gcc_assert (REG_P (reg[i]));
21141 }
21142
21143 /* Do not attempt to merge the loads if the loads clobber each other. */
21144 for (int i = 0; i < 8; i += 2)
21145 for (int j = i + 2; j < 8; j += 2)
21146 if (reg_overlap_mentioned_p (operands[i], operands[j]))
21147 return false;
21148 }
21149 else
21150 for (int i = 0; i < num_insns; i++)
21151 {
21152 mem[i] = operands[2 * i];
21153 reg[i] = operands[2 * i + 1];
21154 }
21155
21156 /* Skip if memory operand is by itself valid for ldp/stp. */
21157 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21158 return false;
21159
21160 for (int i = 0; i < num_insns; i++)
21161 {
21162 /* The mems cannot be volatile. */
21163 if (MEM_VOLATILE_P (mem[i]))
21164 return false;
21165
21166 /* Check if the addresses are in the form of [base+offset]. */
21167 extract_base_offset_in_addr (mem[i], base + i, offset + i);
21168 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21169 return false;
21170 }
21171
21172 /* Check if the registers are of the same class.  */
21173 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21174 ? FP_REGS : GENERAL_REGS;
21175
21176 for (int i = 1; i < num_insns; i++)
21177 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21178 {
21179 if (rclass != FP_REGS)
21180 return false;
21181 }
21182 else
21183 {
21184 if (rclass != GENERAL_REGS)
21185 return false;
21186 }
21187
21188 /* Only the last register in the order in which they occur
21189 may be clobbered by the load. */
21190 if (rclass == GENERAL_REGS && load)
21191 for (int i = 0; i < num_insns - 1; i++)
21192 if (reg_mentioned_p (reg[i], mem[i]))
21193 return false;
21194
21195 /* Check if the bases are the same.  */
21196 for (int i = 0; i < num_insns - 1; i++)
21197 if (!rtx_equal_p (base[i], base[i + 1]))
21198 return false;
21199
21200 for (int i = 0; i < num_insns; i++)
21201 offvals[i] = INTVAL (offset[i]);
21202
21203 msize = GET_MODE_SIZE (mode);
21204
21205 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21206 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21207 aarch64_host_wide_int_compare);
21208
21209 if (!(offvals[1] == offvals[0] + msize
21210 && offvals[3] == offvals[2] + msize))
21211 return false;
21212
21213 /* Check that offsets are within range of each other. The ldp/stp
21214 instructions have 7-bit immediate offsets, so use 0x80.  */
21215 if (offvals[2] - offvals[0] >= msize * 0x80)
21216 return false;
21217
21218 /* The offsets must be aligned with respect to each other. */
21219 if (offvals[0] % msize != offvals[2] % msize)
21220 return false;
21221
21222 /* If we have SImode and slow unaligned ldp,
21223 check that the alignment is at least 8 bytes.  */
21224 if (mode == SImode
21225 && (aarch64_tune_params.extra_tuning_flags
21226 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21227 && !optimize_size
21228 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21229 return false;
21230
21231 return true;
21232 }
21233
21234 /* Given OPERANDS of consecutive load/store, this function pairs them
21235 into LDP/STP after adjusting the offset. It depends on the fact
21236 that the operands can be sorted so the offsets are correct for STP.
21237 MODE is the mode of memory operands. CODE is the rtl operator
21238 which should be applied to all memory operands; it is SIGN_EXTEND,
21239 ZERO_EXTEND or UNKNOWN. */
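/* As a worked example of the range calculation below (illustrative):
for SImode, msize is 4, so the allowed LDP/STP offsets run from
-msize * 0x40 = -256 up to msize * (0x40 - 1) = 252, and the base
adjustment below tries to place both pairs within that window.  */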
21240
21241 bool
21242 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21243 scalar_mode mode, RTX_CODE code)
21244 {
21245 rtx base, offset_1, offset_3, t1, t2;
21246 rtx mem_1, mem_2, mem_3, mem_4;
21247 rtx temp_operands[8];
21248 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21249 stp_off_upper_limit, stp_off_lower_limit, msize;
21250
21251 /* We make changes on a copy as we may still bail out. */
21252 for (int i = 0; i < 8; i ++)
21253 temp_operands[i] = operands[i];
21254
21255 /* Sort the operands. */
21256 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21257
21258 /* Copy the memory operands so that if we have to bail for some
21259 reason the original addresses are unchanged. */
21260 if (load)
21261 {
21262 mem_1 = copy_rtx (temp_operands[1]);
21263 mem_2 = copy_rtx (temp_operands[3]);
21264 mem_3 = copy_rtx (temp_operands[5]);
21265 mem_4 = copy_rtx (temp_operands[7]);
21266 }
21267 else
21268 {
21269 mem_1 = copy_rtx (temp_operands[0]);
21270 mem_2 = copy_rtx (temp_operands[2]);
21271 mem_3 = copy_rtx (temp_operands[4]);
21272 mem_4 = copy_rtx (temp_operands[6]);
21273 gcc_assert (code == UNKNOWN);
21274 }
21275
21276 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21277 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21278 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21279 && offset_3 != NULL_RTX);
21280
21281 /* Adjust offset so it can fit in LDP/STP instruction. */
21282 msize = GET_MODE_SIZE (mode);
21283 stp_off_upper_limit = msize * (0x40 - 1);
21284 stp_off_lower_limit = - msize * 0x40;
21285
21286 off_val_1 = INTVAL (offset_1);
21287 off_val_3 = INTVAL (offset_3);
21288
21289 /* The base offset is optimally half way between the two STP/LDP offsets. */
21290 if (msize <= 4)
21291 base_off = (off_val_1 + off_val_3) / 2;
21292 else
21293 /* However, due to issues with negative LDP/STP offset generation for
21294 larger modes (DF, DI and vector modes), we must not use negative
21295 addresses smaller than 9 signed unadjusted bits can store.  This
21296 provides the most range in this case.  */
21297 base_off = off_val_1;
21298
21299 /* Adjust the base so that it is aligned with the addresses but still
21300 optimal. */
21301 if (base_off % msize != off_val_1 % msize)
21302 /* Fix the offset, bearing in mind we want to make it bigger not
21303 smaller. */
21304 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21305 else if (msize <= 4)
21306 /* The negative range of LDP/STP is one larger than the positive range. */
21307 base_off += msize;
21308
21309 /* Check if base offset is too big or too small. We can attempt to resolve
21310 this issue by setting it to the maximum value and seeing if the offsets
21311 still fit. */
21312 if (base_off >= 0x1000)
21313 {
21314 base_off = 0x1000 - 1;
21315 /* We must still make sure that the base offset is aligned with respect
21316 to the address. But it may not be made any bigger. */
21317 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21318 }
21319
21320 /* Likewise for the case where the base is too small. */
21321 if (base_off <= -0x1000)
21322 {
21323 base_off = -0x1000 + 1;
21324 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21325 }
21326
21327 /* Offset of the first STP/LDP. */
21328 new_off_1 = off_val_1 - base_off;
21329
21330 /* Offset of the second STP/LDP. */
21331 new_off_3 = off_val_3 - base_off;
21332
21333 /* The offsets must be within the range of the LDP/STP instructions. */
21334 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21335 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21336 return false;
21337
21338 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21339 new_off_1), true);
21340 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21341 new_off_1 + msize), true);
21342 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21343 new_off_3), true);
21344 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21345 new_off_3 + msize), true);
21346
21347 if (!aarch64_mem_pair_operand (mem_1, mode)
21348 || !aarch64_mem_pair_operand (mem_3, mode))
21349 return false;
21350
21351 if (code == ZERO_EXTEND)
21352 {
21353 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21354 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21355 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21356 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21357 }
21358 else if (code == SIGN_EXTEND)
21359 {
21360 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21361 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21362 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21363 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21364 }
21365
21366 if (load)
21367 {
21368 operands[0] = temp_operands[0];
21369 operands[1] = mem_1;
21370 operands[2] = temp_operands[2];
21371 operands[3] = mem_2;
21372 operands[4] = temp_operands[4];
21373 operands[5] = mem_3;
21374 operands[6] = temp_operands[6];
21375 operands[7] = mem_4;
21376 }
21377 else
21378 {
21379 operands[0] = mem_1;
21380 operands[1] = temp_operands[1];
21381 operands[2] = mem_2;
21382 operands[3] = temp_operands[3];
21383 operands[4] = mem_3;
21384 operands[5] = temp_operands[5];
21385 operands[6] = mem_4;
21386 operands[7] = temp_operands[7];
21387 }
21388
21389 /* Emit adjusting instruction. */
21390 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21391 /* Emit ldp/stp instructions. */
21392 t1 = gen_rtx_SET (operands[0], operands[1]);
21393 t2 = gen_rtx_SET (operands[2], operands[3]);
21394 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21395 t1 = gen_rtx_SET (operands[4], operands[5]);
21396 t2 = gen_rtx_SET (operands[6], operands[7]);
21397 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21398 return true;
21399 }
21400
21401 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21402 it isn't worth branching around empty masked ops (including masked
21403 stores). */
21404
21405 static bool
21406 aarch64_empty_mask_is_expensive (unsigned)
21407 {
21408 return false;
21409 }
21410
21411 /* Return true if a pseudo register should be created and used to hold
21412 the GOT address for PIC code.  */
21413
21414 bool
21415 aarch64_use_pseudo_pic_reg (void)
21416 {
21417 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21418 }
21419
21420 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21421
21422 static int
21423 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21424 {
21425 switch (XINT (x, 1))
21426 {
21427 case UNSPEC_GOTSMALLPIC:
21428 case UNSPEC_GOTSMALLPIC28K:
21429 case UNSPEC_GOTTINYPIC:
21430 return 0;
21431 default:
21432 break;
21433 }
21434
21435 return default_unspec_may_trap_p (x, flags);
21436 }
21437
21438
21439 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
21440 return the log2 of that value.  Otherwise return -1.  */
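/* For example (illustrative): for X representing 8.0 this returns 3;
for 3.0 or -4.0 it returns -1.  */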
21441
21442 int
21443 aarch64_fpconst_pow_of_2 (rtx x)
21444 {
21445 const REAL_VALUE_TYPE *r;
21446
21447 if (!CONST_DOUBLE_P (x))
21448 return -1;
21449
21450 r = CONST_DOUBLE_REAL_VALUE (x);
21451
21452 if (REAL_VALUE_NEGATIVE (*r)
21453 || REAL_VALUE_ISNAN (*r)
21454 || REAL_VALUE_ISINF (*r)
21455 || !real_isinteger (r, DFmode))
21456 return -1;
21457
21458 return exact_log2 (real_to_integer (r));
21459 }
21460
21461 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21462 power of 2 (i.e. 1/2^n), return the exponent n.
21463 Otherwise return -1.  */
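/* For example (illustrative): for X representing 0.125 (= 1/2^3) this
returns 3; for 8.0 or 1.0 it returns -1.  */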
21464
21465 int
21466 aarch64_fpconst_pow2_recip (rtx x)
21467 {
21468 REAL_VALUE_TYPE r0;
21469
21470 if (!CONST_DOUBLE_P (x))
21471 return -1;
21472
21473 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21474 if (exact_real_inverse (DFmode, &r0)
21475 && !REAL_VALUE_NEGATIVE (r0))
21476 {
21477 int ret = exact_log2 (real_to_integer (&r0));
21478 if (ret >= 1 && ret <= 32)
21479 return ret;
21480 }
21481 return -1;
21482 }
21483
21484 /* If X is a vector of equal CONST_DOUBLE values and that value is
21485 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21486
21487 int
21488 aarch64_vec_fpconst_pow_of_2 (rtx x)
21489 {
21490 int nelts;
21491 if (GET_CODE (x) != CONST_VECTOR
21492 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21493 return -1;
21494
21495 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21496 return -1;
21497
21498 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21499 if (firstval <= 0)
21500 return -1;
21501
21502 for (int i = 1; i < nelts; i++)
21503 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21504 return -1;
21505
21506 return firstval;
21507 }
21508
21509 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21510 to float.
21511
21512 __fp16 always promotes through this hook.
21513 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21514 through the generic excess precision logic rather than here. */
21515
21516 static tree
21517 aarch64_promoted_type (const_tree t)
21518 {
21519 if (SCALAR_FLOAT_TYPE_P (t)
21520 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21521 return float_type_node;
21522
21523 return NULL_TREE;
21524 }
21525
21526 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21527
21528 static bool
21529 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
21530 optimization_type opt_type)
21531 {
21532 switch (op)
21533 {
21534 case rsqrt_optab:
21535 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
21536
21537 default:
21538 return true;
21539 }
21540 }
21541
21542 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21543
21544 static unsigned int
21545 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
21546 int *offset)
21547 {
21548 /* Polynomial invariant 1 == (VG / 2) - 1. */
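/* For example (illustrative): with 256-bit SVE vectors VG is 4, so this
indeterminate evaluates to (4 / 2) - 1 = 1.  */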
21549 gcc_assert (i == 1);
21550 *factor = 2;
21551 *offset = 1;
21552 return AARCH64_DWARF_VG;
21553 }
21554
21555 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21556 if MODE is HFmode, and punt to the generic implementation otherwise. */
21557
21558 static bool
21559 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21560 {
21561 return (mode == HFmode
21562 ? true
21563 : default_libgcc_floating_mode_supported_p (mode));
21564 }
21565
21566 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21567 if MODE is HFmode, and punt to the generic implementation otherwise. */
21568
21569 static bool
21570 aarch64_scalar_mode_supported_p (scalar_mode mode)
21571 {
21572 return (mode == HFmode
21573 ? true
21574 : default_scalar_mode_supported_p (mode));
21575 }
21576
21577 /* Set the value of FLT_EVAL_METHOD.
21578 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21579
21580 0: evaluate all operations and constants, whose semantic type has at
21581 most the range and precision of type float, to the range and
21582 precision of float; evaluate all other operations and constants to
21583 the range and precision of the semantic type;
21584
21585 N, where _FloatN is a supported interchange floating type:
21586 evaluate all operations and constants, whose semantic type has at
21587 most the range and precision of _FloatN type, to the range and
21588 precision of the _FloatN type; evaluate all other operations and
21589 constants to the range and precision of the semantic type;
21590
21591 If we have the ARMv8.2-A extensions then we support _Float16 in native
21592 precision, so we should set this to 16. Otherwise, we support the type,
21593 but want to evaluate expressions in float precision, so set this to
21594 0. */
21595
21596 static enum flt_eval_method
21597 aarch64_excess_precision (enum excess_precision_type type)
21598 {
21599 switch (type)
21600 {
21601 case EXCESS_PRECISION_TYPE_FAST:
21602 case EXCESS_PRECISION_TYPE_STANDARD:
21603 /* We can calculate either in 16-bit range and precision or
21604 32-bit range and precision. Make that decision based on whether
21605 we have native support for the ARMv8.2-A 16-bit floating-point
21606 instructions or not. */
21607 return (TARGET_FP_F16INST
21608 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21609 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21610 case EXCESS_PRECISION_TYPE_IMPLICIT:
21611 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21612 default:
21613 gcc_unreachable ();
21614 }
21615 return FLT_EVAL_METHOD_UNPREDICTABLE;
21616 }
21617
21618 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21619 scheduled for speculative execution. Reject the long-running division
21620 and square-root instructions. */
21621
21622 static bool
21623 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21624 {
21625 switch (get_attr_type (insn))
21626 {
21627 case TYPE_SDIV:
21628 case TYPE_UDIV:
21629 case TYPE_FDIVS:
21630 case TYPE_FDIVD:
21631 case TYPE_FSQRTS:
21632 case TYPE_FSQRTD:
21633 case TYPE_NEON_FP_SQRT_S:
21634 case TYPE_NEON_FP_SQRT_D:
21635 case TYPE_NEON_FP_SQRT_S_Q:
21636 case TYPE_NEON_FP_SQRT_D_Q:
21637 case TYPE_NEON_FP_DIV_S:
21638 case TYPE_NEON_FP_DIV_D:
21639 case TYPE_NEON_FP_DIV_S_Q:
21640 case TYPE_NEON_FP_DIV_D_Q:
21641 return false;
21642 default:
21643 return true;
21644 }
21645 }
21646
21647 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21648
21649 static int
21650 aarch64_compute_pressure_classes (reg_class *classes)
21651 {
21652 int i = 0;
21653 classes[i++] = GENERAL_REGS;
21654 classes[i++] = FP_REGS;
21655 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21656 registers need to go in PR_LO_REGS at some point during their
21657 lifetime. Splitting it into two halves has the effect of making
21658 all predicates count against PR_LO_REGS, so that we try whenever
21659 possible to restrict the number of live predicates to 8. This
21660 greatly reduces the amount of spilling in certain loops. */
21661 classes[i++] = PR_LO_REGS;
21662 classes[i++] = PR_HI_REGS;
21663 return i;
21664 }
21665
21666 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21667
21668 static bool
21669 aarch64_can_change_mode_class (machine_mode from,
21670 machine_mode to, reg_class_t)
21671 {
21672 unsigned int from_flags = aarch64_classify_vector_mode (from);
21673 unsigned int to_flags = aarch64_classify_vector_mode (to);
21674
21675 bool from_sve_p = (from_flags & VEC_ANY_SVE);
21676 bool to_sve_p = (to_flags & VEC_ANY_SVE);
21677
21678 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
21679 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
21680
21681 /* Don't allow changes between partial SVE modes and other modes.
21682 The contents of partial SVE modes are distributed evenly across
21683 the register, whereas GCC expects them to be clustered together. */
21684 if (from_partial_sve_p != to_partial_sve_p)
21685 return false;
21686
21687 /* Similarly reject changes between partial SVE modes that have
21688 different patterns of significant and insignificant bits. */
21689 if (from_partial_sve_p
21690 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
21691 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
21692 return false;
21693
21694 if (BYTES_BIG_ENDIAN)
21695 {
21696 /* Don't allow changes between SVE data modes and non-SVE modes.
21697 See the comment at the head of aarch64-sve.md for details. */
21698 if (from_sve_p != to_sve_p)
21699 return false;
21700
21701 /* Don't allow changes in element size: lane 0 of the new vector
21702 would not then be lane 0 of the old vector. See the comment
21703 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21704 description.
21705
21706 In the worst case, this forces a register to be spilled in
21707 one mode and reloaded in the other, which handles the
21708 endianness correctly. */
21709 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21710 return false;
21711 }
21712 return true;
21713 }
21714
21715 /* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */
21716
21717 static void
21718 aarch64_select_early_remat_modes (sbitmap modes)
21719 {
21720 /* SVE values are not normally live across a call, so it should be
21721 worth doing early rematerialization even in VL-specific mode. */
21722 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21723 if (aarch64_sve_mode_p ((machine_mode) i))
21724 bitmap_set_bit (modes, i);
21725 }
21726
21727 /* Override the default target speculation_safe_value. */
21728 static rtx
21729 aarch64_speculation_safe_value (machine_mode mode,
21730 rtx result, rtx val, rtx failval)
21731 {
21732 /* Maybe we should warn if falling back to hard barriers. They are
21733 likely to be noticeably more expensive than the alternative below.  */
21734 if (!aarch64_track_speculation)
21735 return default_speculation_safe_value (mode, result, val, failval);
21736
21737 if (!REG_P (val))
21738 val = copy_to_mode_reg (mode, val);
21739
21740 if (!aarch64_reg_or_zero (failval, mode))
21741 failval = copy_to_mode_reg (mode, failval);
21742
21743 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21744 return result;
21745 }
21746
21747 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21748 Look into the tuning structure for an estimate.
21749 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21750 Advanced SIMD 128 bits. */
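/* For example (illustrative): if the tuning structure gives an SVE width
of 256 bits, a poly_int64 of 16 + 16x (the number of bytes in an SVE
vector) is estimated as 16 + 16 * (256 - 128) / 128 = 32.  */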
21751
21752 static HOST_WIDE_INT
21753 aarch64_estimated_poly_value (poly_int64 val)
21754 {
21755 enum aarch64_sve_vector_bits_enum width_source
21756 = aarch64_tune_params.sve_width;
21757
21758 /* If we still don't have an estimate, use the default. */
21759 if (width_source == SVE_SCALABLE)
21760 return default_estimated_poly_value (val);
21761
21762 HOST_WIDE_INT over_128 = width_source - 128;
21763 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21764 }
21765
21766
21767 /* Return true for types that could be supported as SIMD return or
21768 argument types. */
21769
21770 static bool
21771 supported_simd_type (tree t)
21772 {
21773 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21774 {
21775 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21776 return s == 1 || s == 2 || s == 4 || s == 8;
21777 }
21778 return false;
21779 }
21780
21781 /* Return true for types that currently are supported as SIMD return
21782 or argument types. */
21783
21784 static bool
21785 currently_supported_simd_type (tree t, tree b)
21786 {
21787 if (COMPLEX_FLOAT_TYPE_P (t))
21788 return false;
21789
21790 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21791 return false;
21792
21793 return supported_simd_type (t);
21794 }
21795
21796 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
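/* For example (illustrative): a simd clone of a function whose base type
is 32-bit float and which has no explicit simdlen gets two clones below,
one with simdlen 2 (64-bit vectors) and one with simdlen 4 (128-bit
vectors).  */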
21797
21798 static int
21799 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21800 struct cgraph_simd_clone *clonei,
21801 tree base_type, int num)
21802 {
21803 tree t, ret_type, arg_type;
21804 unsigned int elt_bits, vec_bits, count;
21805
21806 if (!TARGET_SIMD)
21807 return 0;
21808
21809 if (clonei->simdlen
21810 && (clonei->simdlen < 2
21811 || clonei->simdlen > 1024
21812 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21813 {
21814 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21815 "unsupported simdlen %d", clonei->simdlen);
21816 return 0;
21817 }
21818
21819 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21820 if (TREE_CODE (ret_type) != VOID_TYPE
21821 && !currently_supported_simd_type (ret_type, base_type))
21822 {
21823 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21824 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21825 "GCC does not currently support mixed size types "
21826 "for %<simd%> functions");
21827 else if (supported_simd_type (ret_type))
21828 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21829 "GCC does not currently support return type %qT "
21830 "for %<simd%> functions", ret_type);
21831 else
21832 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21833 "unsupported return type %qT for %<simd%> functions",
21834 ret_type);
21835 return 0;
21836 }
21837
21838 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21839 {
21840 arg_type = TREE_TYPE (t);
21841
21842 if (!currently_supported_simd_type (arg_type, base_type))
21843 {
21844 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21845 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21846 "GCC does not currently support mixed size types "
21847 "for %<simd%> functions");
21848 else
21849 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21850 "GCC does not currently support argument type %qT "
21851 "for %<simd%> functions", arg_type);
21852 return 0;
21853 }
21854 }
21855
21856 clonei->vecsize_mangle = 'n';
21857 clonei->mask_mode = VOIDmode;
21858 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21859 if (clonei->simdlen == 0)
21860 {
21861 count = 2;
21862 vec_bits = (num == 0 ? 64 : 128);
21863 clonei->simdlen = vec_bits / elt_bits;
21864 }
21865 else
21866 {
21867 count = 1;
21868 vec_bits = clonei->simdlen * elt_bits;
21869 if (vec_bits != 64 && vec_bits != 128)
21870 {
21871 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21872 "GCC does not currently support simdlen %d for type %qT",
21873 clonei->simdlen, base_type);
21874 return 0;
21875 }
21876 }
21877 clonei->vecsize_int = vec_bits;
21878 clonei->vecsize_float = vec_bits;
21879 return count;
21880 }
21881
21882 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21883
21884 static void
21885 aarch64_simd_clone_adjust (struct cgraph_node *node)
21886 {
21887 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21888 use the correct ABI. */
21889
21890 tree t = TREE_TYPE (node->decl);
21891 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21892 TYPE_ATTRIBUTES (t));
21893 }
21894
21895 /* Implement TARGET_SIMD_CLONE_USABLE. */
21896
21897 static int
21898 aarch64_simd_clone_usable (struct cgraph_node *node)
21899 {
21900 switch (node->simdclone->vecsize_mangle)
21901 {
21902 case 'n':
21903 if (!TARGET_SIMD)
21904 return -1;
21905 return 0;
21906 default:
21907 gcc_unreachable ();
21908 }
21909 }
21910
21911 /* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */
21912
21913 static int
21914 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21915 {
21916 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21917 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21918 return 0;
21919 return 1;
21920 }
21921
21922 /* Implement TARGET_GET_MULTILIB_ABI_NAME.  */
21923
21924 static const char *
21925 aarch64_get_multilib_abi_name (void)
21926 {
21927 if (TARGET_BIG_END)
21928 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21929 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21930 }
21931
21932 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
21933 global variable based guard, use the default; otherwise
21934 return a null tree.  */
21935 static tree
21936 aarch64_stack_protect_guard (void)
21937 {
21938 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21939 return default_stack_protect_guard ();
21940
21941 return NULL_TREE;
21942 }
21943
21944 /* Return the diagnostic message string if conversion from FROMTYPE to
21945 TOTYPE is not allowed, NULL otherwise. */
21946
21947 static const char *
21948 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
21949 {
21950 if (element_mode (fromtype) != element_mode (totype))
21951 {
21952 /* Do not allow conversions to/from BFmode scalar types.  */
21953 if (TYPE_MODE (fromtype) == BFmode)
21954 return N_("invalid conversion from type %<bfloat16_t%>");
21955 if (TYPE_MODE (totype) == BFmode)
21956 return N_("invalid conversion to type %<bfloat16_t%>");
21957 }
21958
21959 /* Conversion allowed. */
21960 return NULL;
21961 }
21962
21963 /* Return the diagnostic message string if the unary operation OP is
21964 not permitted on TYPE, NULL otherwise. */
21965
21966 static const char *
21967 aarch64_invalid_unary_op (int op, const_tree type)
21968 {
21969 /* Reject all single-operand operations on BFmode except for &. */
21970 if (element_mode (type) == BFmode && op != ADDR_EXPR)
21971 return N_("operation not permitted on type %<bfloat16_t%>");
21972
21973 /* Operation allowed. */
21974 return NULL;
21975 }
21976
21977 /* Return the diagnostic message string if the binary operation OP is
21978 not permitted on TYPE1 and TYPE2, NULL otherwise. */
21979
21980 static const char *
21981 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
21982 const_tree type2)
21983 {
21984 /* Reject all 2-operand operations on BFmode. */
21985 if (element_mode (type1) == BFmode
21986 || element_mode (type2) == BFmode)
21987 return N_("operation not permitted on type %<bfloat16_t%>");
21988
21989 /* Operation allowed. */
21990 return NULL;
21991 }
21992
21993 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21994 section at the end if needed. */
21995 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21996 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21997 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
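/* As an illustrative sketch: when both BTI and PAC are enabled,
feature_1_and is 3 and the emitted .note.gnu.property section contains
namesz 4, descsz ROUND_UP (12, POINTER_BYTES) (16 for LP64), type 5
(NT_GNU_PROPERTY_TYPE_0), the name "GNU", then the property
GNU_PROPERTY_AARCH64_FEATURE_1_AND with datasz 4 and data 3, padded to
the pointer alignment.  */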
21998 void
21999 aarch64_file_end_indicate_exec_stack ()
22000 {
22001 file_end_indicate_exec_stack ();
22002
22003 unsigned feature_1_and = 0;
22004 if (aarch64_bti_enabled ())
22005 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
22006
22007 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
22008 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
22009
22010 if (feature_1_and)
22011 {
22012 /* Generate .note.gnu.property section. */
22013 switch_to_section (get_section (".note.gnu.property",
22014 SECTION_NOTYPE, NULL));
22015
22016 /* PT_NOTE header: namesz, descsz, type.
22017 namesz = 4 ("GNU\0")
22018 descsz = 16 (Size of the program property array)
22019 [(12 + padding) * Number of array elements]
22020 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
22021 assemble_align (POINTER_SIZE);
22022 assemble_integer (GEN_INT (4), 4, 32, 1);
22023 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
22024 assemble_integer (GEN_INT (5), 4, 32, 1);
22025
22026 /* PT_NOTE name. */
22027 assemble_string ("GNU", 4);
22028
22029 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
22030 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
22031 datasz = 4
22032 data = feature_1_and. */
22033 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
22034 assemble_integer (GEN_INT (4), 4, 32, 1);
22035 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
22036
22037 /* Pad the size of the note to the required alignment. */
22038 assemble_align (POINTER_SIZE);
22039 }
22040 }
22041 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
22042 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
22043 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
22044
22045 /* Target-specific selftests. */
22046
22047 #if CHECKING_P
22048
22049 namespace selftest {
22050
22051 /* Selftest for the RTL loader.
22052 Verify that the RTL loader copes with a dump from
22053 print_rtx_function. This is essentially just a test that class
22054 function_reader can handle a real dump, but it also verifies
22055 that lookup_reg_by_dump_name correctly handles hard regs.
22056 The presence of hard reg names in the dump means that the test is
22057 target-specific, hence it is in this file. */
22058
22059 static void
22060 aarch64_test_loading_full_dump ()
22061 {
22062 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
22063
22064 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
22065
22066 rtx_insn *insn_1 = get_insn_by_uid (1);
22067 ASSERT_EQ (NOTE, GET_CODE (insn_1));
22068
22069 rtx_insn *insn_15 = get_insn_by_uid (15);
22070 ASSERT_EQ (INSN, GET_CODE (insn_15));
22071 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
22072
22073 /* Verify crtl->return_rtx. */
22074 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
22075 ASSERT_EQ (0, REGNO (crtl->return_rtx));
22076 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
22077 }
22078
22079 /* Run all target-specific selftests. */
22080
22081 static void
22082 aarch64_run_selftests (void)
22083 {
22084 aarch64_test_loading_full_dump ();
22085 }
22086
22087 } // namespace selftest
22088
22089 #endif /* #if CHECKING_P */
22090
22091 #undef TARGET_STACK_PROTECT_GUARD
22092 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
22093
22094 #undef TARGET_ADDRESS_COST
22095 #define TARGET_ADDRESS_COST aarch64_address_cost
22096
22097 /* This hook determines whether unnamed bitfields affect the alignment
22098 of the containing structure. The hook returns true if the structure
22099 should inherit the alignment requirements of an unnamed bitfield's
22100 type. */
22101 #undef TARGET_ALIGN_ANON_BITFIELD
22102 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
22103
22104 #undef TARGET_ASM_ALIGNED_DI_OP
22105 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
22106
22107 #undef TARGET_ASM_ALIGNED_HI_OP
22108 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
22109
22110 #undef TARGET_ASM_ALIGNED_SI_OP
22111 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
22112
22113 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
22114 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
22115 hook_bool_const_tree_hwi_hwi_const_tree_true
22116
22117 #undef TARGET_ASM_FILE_START
22118 #define TARGET_ASM_FILE_START aarch64_start_file
22119
22120 #undef TARGET_ASM_OUTPUT_MI_THUNK
22121 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
22122
22123 #undef TARGET_ASM_SELECT_RTX_SECTION
22124 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
22125
22126 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
22127 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
22128
22129 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
22130 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
22131
22132 #undef TARGET_BUILD_BUILTIN_VA_LIST
22133 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
22134
22135 #undef TARGET_CALLEE_COPIES
22136 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
22137
22138 #undef TARGET_CAN_ELIMINATE
22139 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
22140
22141 #undef TARGET_CAN_INLINE_P
22142 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
22143
22144 #undef TARGET_CANNOT_FORCE_CONST_MEM
22145 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
22146
22147 #undef TARGET_CASE_VALUES_THRESHOLD
22148 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
22149
22150 #undef TARGET_CONDITIONAL_REGISTER_USAGE
22151 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
22152
22153 /* Only the least significant bit is used for initialization guard
22154 variables. */
22155 #undef TARGET_CXX_GUARD_MASK_BIT
22156 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
22157
22158 #undef TARGET_C_MODE_FOR_SUFFIX
22159 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22160
22161 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22162 #undef TARGET_DEFAULT_TARGET_FLAGS
22163 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22164 #endif
22165
22166 #undef TARGET_CLASS_MAX_NREGS
22167 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22168
22169 #undef TARGET_BUILTIN_DECL
22170 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22171
22172 #undef TARGET_BUILTIN_RECIPROCAL
22173 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22174
22175 #undef TARGET_C_EXCESS_PRECISION
22176 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22177
22178 #undef TARGET_EXPAND_BUILTIN
22179 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22180
22181 #undef TARGET_EXPAND_BUILTIN_VA_START
22182 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22183
22184 #undef TARGET_FOLD_BUILTIN
22185 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22186
22187 #undef TARGET_FUNCTION_ARG
22188 #define TARGET_FUNCTION_ARG aarch64_function_arg
22189
22190 #undef TARGET_FUNCTION_ARG_ADVANCE
22191 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22192
22193 #undef TARGET_FUNCTION_ARG_BOUNDARY
22194 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22195
22196 #undef TARGET_FUNCTION_ARG_PADDING
22197 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22198
22199 #undef TARGET_GET_RAW_RESULT_MODE
22200 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22201 #undef TARGET_GET_RAW_ARG_MODE
22202 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22203
22204 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22205 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22206
22207 #undef TARGET_FUNCTION_VALUE
22208 #define TARGET_FUNCTION_VALUE aarch64_function_value
22209
22210 #undef TARGET_FUNCTION_VALUE_REGNO_P
22211 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22212
22213 #undef TARGET_GIMPLE_FOLD_BUILTIN
22214 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22215
22216 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22217 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22218
22219 #undef TARGET_INIT_BUILTINS
22220 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22221
22222 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22223 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22224 aarch64_ira_change_pseudo_allocno_class
22225
22226 #undef TARGET_LEGITIMATE_ADDRESS_P
22227 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22228
22229 #undef TARGET_LEGITIMATE_CONSTANT_P
22230 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22231
22232 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22233 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22234 aarch64_legitimize_address_displacement
22235
22236 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22237 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22238
22239 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22240 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22241 aarch64_libgcc_floating_mode_supported_p
22242
22243 #undef TARGET_MANGLE_TYPE
22244 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22245
22246 #undef TARGET_INVALID_CONVERSION
22247 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22248
22249 #undef TARGET_INVALID_UNARY_OP
22250 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22251
22252 #undef TARGET_INVALID_BINARY_OP
22253 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22254
22255 #undef TARGET_VERIFY_TYPE_CONTEXT
22256 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22257
22258 #undef TARGET_MEMORY_MOVE_COST
22259 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22260
22261 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22262 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22263
22264 #undef TARGET_MUST_PASS_IN_STACK
22265 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22266
22267 /* This target hook should return true if accesses to volatile bitfields
22268 should use the narrowest mode possible. It should return false if these
22269 accesses should use the bitfield container type. */
22270 #undef TARGET_NARROW_VOLATILE_BITFIELD
22271 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22272
22273 #undef TARGET_OPTION_OVERRIDE
22274 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22275
22276 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22277 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22278 aarch64_override_options_after_change
22279
22280 #undef TARGET_OPTION_SAVE
22281 #define TARGET_OPTION_SAVE aarch64_option_save
22282
22283 #undef TARGET_OPTION_RESTORE
22284 #define TARGET_OPTION_RESTORE aarch64_option_restore
22285
22286 #undef TARGET_OPTION_PRINT
22287 #define TARGET_OPTION_PRINT aarch64_option_print
22288
22289 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22290 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22291
22292 #undef TARGET_SET_CURRENT_FUNCTION
22293 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22294
22295 #undef TARGET_PASS_BY_REFERENCE
22296 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22297
22298 #undef TARGET_PREFERRED_RELOAD_CLASS
22299 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22300
22301 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22302 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22303
22304 #undef TARGET_PROMOTED_TYPE
22305 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22306
22307 #undef TARGET_SECONDARY_RELOAD
22308 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22309
22310 #undef TARGET_SHIFT_TRUNCATION_MASK
22311 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22312
22313 #undef TARGET_SETUP_INCOMING_VARARGS
22314 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22315
22316 #undef TARGET_STRUCT_VALUE_RTX
22317 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22318
22319 #undef TARGET_REGISTER_MOVE_COST
22320 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22321
22322 #undef TARGET_RETURN_IN_MEMORY
22323 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22324
22325 #undef TARGET_RETURN_IN_MSB
22326 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22327
22328 #undef TARGET_RTX_COSTS
22329 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22330
22331 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22332 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22333
22334 #undef TARGET_SCHED_ISSUE_RATE
22335 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22336
22337 #undef TARGET_SCHED_VARIABLE_ISSUE
22338 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22339
22340 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22341 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22342 aarch64_sched_first_cycle_multipass_dfa_lookahead
22343
22344 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22345 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22346 aarch64_first_cycle_multipass_dfa_lookahead_guard
22347
22348 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22349 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22350 aarch64_get_separate_components
22351
22352 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22353 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22354 aarch64_components_for_bb
22355
22356 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22357 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22358 aarch64_disqualify_components
22359
22360 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22361 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22362 aarch64_emit_prologue_components
22363
22364 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22365 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22366 aarch64_emit_epilogue_components
22367
22368 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22369 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22370 aarch64_set_handled_components
22371
22372 #undef TARGET_TRAMPOLINE_INIT
22373 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22374
22375 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22376 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22377
22378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22379 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22380
22381 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22382 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22383
22384 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22385 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22386 aarch64_builtin_support_vector_misalignment
22387
22388 #undef TARGET_ARRAY_MODE
22389 #define TARGET_ARRAY_MODE aarch64_array_mode
22390
22391 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22392 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22393
22394 #undef TARGET_VECTORIZE_ADD_STMT_COST
22395 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22396
22397 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22398 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22399 aarch64_builtin_vectorization_cost
22400
22401 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22402 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22403
22404 #undef TARGET_VECTORIZE_BUILTINS
22405 #define TARGET_VECTORIZE_BUILTINS
22406
22407 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22408 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22409 aarch64_builtin_vectorized_function
22410
22411 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22412 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22413 aarch64_autovectorize_vector_modes
22414
22415 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22416 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22417 aarch64_atomic_assign_expand_fenv
22418
22419 /* Section anchor support. */
22420
22421 #undef TARGET_MIN_ANCHOR_OFFSET
22422 #define TARGET_MIN_ANCHOR_OFFSET -256
22423
22424 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22425 byte offset; we can do much more for larger data types, but have no way
22426 to determine the size of the access. We assume accesses are aligned. */
22427 #undef TARGET_MAX_ANCHOR_OFFSET
22428 #define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
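/* Illustration of the value above: 4 == (1 << 2), i.e. bit 2 is the flag
   that marks a pointer to a function descriptor rather than to ordinary
   code, given that bits 0 and 1 are unavailable for this purpose.  */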

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"