[Aarch64] Fix vec_perm cost for thunderx2t99
[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
87 {
88 enum insn_type { MOV, MVN, INDEX, PTRUE };
89 enum modifier_type { LSL, MSL };
90
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode, rtx);
93 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
94 insn_type = MOV, modifier_type = LSL,
95 unsigned int = 0);
96 simd_immediate_info (scalar_mode, rtx, rtx);
97 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98
99 /* The mode of the elements. */
100 scalar_mode elt_mode;
101
102 /* The instruction to use to move the immediate into a vector. */
103 insn_type insn;
104
105 union
106 {
107 /* For MOV and MVN. */
108 struct
109 {
110 /* The value of each element. */
111 rtx value;
112
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier;
116 unsigned int shift;
117 } mov;
118
119 /* For INDEX. */
120 struct
121 {
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
124 rtx base, step;
125 } index;
126
127 /* For PTRUE. */
128 aarch64_svpattern pattern;
129 } u;
130 };
131
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
136 : elt_mode (elt_mode_in), insn (MOV)
137 {
138 u.mov.value = value_in;
139 u.mov.modifier = LSL;
140 u.mov.shift = 0;
141 }
142
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
145 fields. */
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in,
148 unsigned HOST_WIDE_INT value_in,
149 insn_type insn_in, modifier_type modifier_in,
150 unsigned int shift_in)
151 : elt_mode (elt_mode_in), insn (insn_in)
152 {
153 u.mov.value = gen_int_mode (value_in, elt_mode_in);
154 u.mov.modifier = modifier_in;
155 u.mov.shift = shift_in;
156 }
157
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
162 : elt_mode (elt_mode_in), insn (INDEX)
163 {
164 u.index.base = base_in;
165 u.index.step = step_in;
166 }
167
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in,
172 aarch64_svpattern pattern_in)
173 : elt_mode (elt_mode_in), insn (PTRUE)
174 {
175 u.pattern = pattern_in;
176 }
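/* For example, a vector whose 16-bit elements all equal 0x2a << 8 could be
   described as
     simd_immediate_info (HImode, 0x2a, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8)
   while an SVE INDEX constant { 0, 2, 4, ... } of 32-bit elements would use
   the (base, step) constructor, roughly
     simd_immediate_info (SImode, const0_rtx, gen_int_mode (2, SImode)).
   The values here are purely illustrative.  */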
177
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel;
180
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg;
183
184 #ifdef HAVE_AS_TLS
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
187 #endif
188
189 static bool aarch64_composite_type_p (const_tree, machine_mode);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
191 const_tree,
192 machine_mode *, int *,
193 bool *);
194 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode);
198 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
200 const_tree type,
201 int misalignment,
202 bool is_packed);
203 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
204 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
205 aarch64_addr_query_type);
206 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version;
210
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune = cortexa53;
213
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags = 0;
216
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads;
219
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer;
222
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string = NULL;
225
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
228
229 /* Support for command line parsing of boolean flags in the tuning
230 structures. */
231 struct aarch64_flag_desc
232 {
233 const char* name;
234 unsigned int flag;
235 };
236
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 {
241 { "none", AARCH64_FUSE_NOTHING },
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL },
244 { NULL, AARCH64_FUSE_NOTHING }
245 };
246
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 {
251 { "none", AARCH64_EXTRA_TUNE_NONE },
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL },
254 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 };
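/* Each entry of the .def files included above expands to one row of these
   tables: AARCH64_FUSION_PAIR ("name", IDENT) becomes
   { "name", AARCH64_FUSE_IDENT }, and likewise for the extra tuning options.
   The name strings are what the fuse=... and tune=... parsers registered
   later in this file (for -moverride) look up.  */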
256
257 /* Tuning parameters. */
258
259 static const struct cpu_addrcost_table generic_addrcost_table =
260 {
261 {
262 1, /* hi */
263 0, /* si */
264 0, /* di */
265 1, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
272 0 /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 {
277 {
278 0, /* hi */
279 0, /* si */
280 0, /* di */
281 2, /* ti */
282 },
283 0, /* pre_modify */
284 0, /* post_modify */
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 {
293 {
294 1, /* hi */
295 0, /* si */
296 0, /* di */
297 1, /* ti */
298 },
299 1, /* pre_modify */
300 1, /* post_modify */
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 {
309 {
310 1, /* hi */
311 1, /* si */
312 1, /* di */
313 2, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 {
325 {
326 1, /* hi */
327 0, /* si */
328 0, /* di */
329 1, /* ti */
330 },
331 0, /* pre_modify */
332 0, /* post_modify */
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
336 0, /* imm_offset */
337 };
338
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 {
341 {
342 1, /* hi */
343 1, /* si */
344 1, /* di */
345 2, /* ti */
346 },
347 1, /* pre_modify */
348 1, /* post_modify */
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
352 2, /* imm_offset */
353 };
354
355 static const struct cpu_regmove_cost generic_regmove_cost =
356 {
357 1, /* GP2GP */
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
360 5, /* GP2FP */
361 5, /* FP2GP */
362 2 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 5, /* GP2FP */
371 5, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 {
377 1, /* GP2GP */
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
380 5, /* GP2FP */
381 5, /* FP2GP */
382 2 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (the actual costs are 4 and 9). */
390 9, /* GP2FP */
391 9, /* FP2GP */
392 1 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 {
397 2, /* GP2GP */
398 2, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 2 /* FP2FP */
411 };
412
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 {
415 2, /* GP2GP */
416 /* Avoid the use of int<->fp moves for spilling. */
417 6, /* GP2FP */
418 6, /* FP2GP */
419 4 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of int<->fp moves for spilling. */
426 8, /* GP2FP */
427 8, /* FP2GP */
428 4 /* FP2FP */
429 };
430
431 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 {
433 1, /* GP2GP */
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
436 2, /* GP2FP */
437 3, /* FP2GP */
438 2 /* FP2FP */
439 };
440
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost =
443 {
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
459 };
460
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 {
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
479 };
480
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost =
483 {
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
499 };
500
501 static const struct cpu_vector_cost tsv110_vector_cost =
502 {
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
518 };
519
520 /* Costs for vector insn classes for Cortex-A57. */
521 static const struct cpu_vector_cost cortexa57_vector_cost =
522 {
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
538 };
539
540 static const struct cpu_vector_cost exynosm1_vector_cost =
541 {
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
557 };
558
559 /* Costs for vector insn classes for X-Gene 1. */
560 static const struct cpu_vector_cost xgene1_vector_cost =
561 {
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
577 };
578
579 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 {
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 10, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
597 };
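/* These per-core tables are unitless weights consumed by the vectorization
   cost hooks; for instance, in the ThunderX2 T99 table above a vector
   permute (cost 10) is rated at twice a vector integer statement (cost 5),
   which biases the vectorizer against permute-heavy sequences on that
   core.  */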
598
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost =
601 {
602 1, /* Predictable. */
603 3 /* Unpredictable. */
604 };
605
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_NONE /* recip_sqrt */
612 };
613
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes =
616 {
617 AARCH64_APPROX_NONE, /* division */
618 AARCH64_APPROX_ALL, /* sqrt */
619 AARCH64_APPROX_ALL /* recip_sqrt */
620 };
621
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes =
624 {
625 AARCH64_APPROX_NONE, /* division */
626 AARCH64_APPROX_NONE, /* sqrt */
627 AARCH64_APPROX_ALL /* recip_sqrt */
628 };
629
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune =
632 {
633 0, /* num_slots */
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
640 };
641
642 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 {
644 0, /* num_slots */
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
651 };
652
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 {
655 4, /* num_slots */
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
662 };
663
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 {
666 8, /* num_slots */
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
673 };
674
675 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 {
677 8, /* num_slots */
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
684 };
685
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 {
688 8, /* num_slots */
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
695 };
696
697 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 {
699 0, /* num_slots */
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
706 };
707
708 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 {
710 8, /* num_slots */
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
717 };
718
719 static const struct tune_params generic_tunings =
720 {
721 &cortexa57_extra_costs,
722 &generic_addrcost_table,
723 &generic_regmove_cost,
724 &generic_vector_cost,
725 &generic_branch_cost,
726 &generic_approx_modes,
727 SVE_NOT_IMPLEMENTED, /* sve_width */
728 4, /* memmov_cost */
729 2, /* issue_rate */
730 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
742 &generic_prefetch_tune
743 };
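/* The function_align/jump_align/loop_align strings use the same N:M syntax
   as the -falign-* options, so "16:12" above requests 16-byte alignment
   with a maximum skip of 12 padding bytes.  */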
744
745 static const struct tune_params cortexa35_tunings =
746 {
747 &cortexa53_extra_costs,
748 &generic_addrcost_table,
749 &cortexa53_regmove_cost,
750 &generic_vector_cost,
751 &generic_branch_cost,
752 &generic_approx_modes,
753 SVE_NOT_IMPLEMENTED, /* sve_width */
754 4, /* memmov_cost */
755 1, /* issue_rate */
756 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
769 &generic_prefetch_tune
770 };
771
772 static const struct tune_params cortexa53_tunings =
773 {
774 &cortexa53_extra_costs,
775 &generic_addrcost_table,
776 &cortexa53_regmove_cost,
777 &generic_vector_cost,
778 &generic_branch_cost,
779 &generic_approx_modes,
780 SVE_NOT_IMPLEMENTED, /* sve_width */
781 4, /* memmov_cost */
782 2, /* issue_rate */
783 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params cortexa57_tunings =
800 {
801 &cortexa57_extra_costs,
802 &generic_addrcost_table,
803 &cortexa57_regmove_cost,
804 &cortexa57_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 SVE_NOT_IMPLEMENTED, /* sve_width */
808 4, /* memmov_cost */
809 3, /* issue_rate */
810 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
823 &generic_prefetch_tune
824 };
825
826 static const struct tune_params cortexa72_tunings =
827 {
828 &cortexa57_extra_costs,
829 &generic_addrcost_table,
830 &cortexa57_regmove_cost,
831 &cortexa57_vector_cost,
832 &generic_branch_cost,
833 &generic_approx_modes,
834 SVE_NOT_IMPLEMENTED, /* sve_width */
835 4, /* memmov_cost */
836 3, /* issue_rate */
837 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
851 };
852
853 static const struct tune_params cortexa73_tunings =
854 {
855 &cortexa57_extra_costs,
856 &generic_addrcost_table,
857 &cortexa57_regmove_cost,
858 &cortexa57_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 SVE_NOT_IMPLEMENTED, /* sve_width */
862 4, /* memmov_cost. */
863 2, /* issue_rate. */
864 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &generic_prefetch_tune
878 };
879
880
881
882 static const struct tune_params exynosm1_tunings =
883 {
884 &exynosm1_extra_costs,
885 &exynosm1_addrcost_table,
886 &exynosm1_regmove_cost,
887 &exynosm1_vector_cost,
888 &generic_branch_cost,
889 &exynosm1_approx_modes,
890 SVE_NOT_IMPLEMENTED, /* sve_width */
891 4, /* memmov_cost */
892 3, /* issue_rate */
893 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
905 &exynosm1_prefetch_tune
906 };
907
908 static const struct tune_params thunderxt88_tunings =
909 {
910 &thunderx_extra_costs,
911 &generic_addrcost_table,
912 &thunderx_regmove_cost,
913 &thunderx_vector_cost,
914 &generic_branch_cost,
915 &generic_approx_modes,
916 SVE_NOT_IMPLEMENTED, /* sve_width */
917 6, /* memmov_cost */
918 2, /* issue_rate */
919 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
931 &thunderxt88_prefetch_tune
932 };
933
934 static const struct tune_params thunderx_tunings =
935 {
936 &thunderx_extra_costs,
937 &generic_addrcost_table,
938 &thunderx_regmove_cost,
939 &thunderx_vector_cost,
940 &generic_branch_cost,
941 &generic_approx_modes,
942 SVE_NOT_IMPLEMENTED, /* sve_width */
943 6, /* memmov_cost */
944 2, /* issue_rate */
945 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
958 &thunderx_prefetch_tune
959 };
960
961 static const struct tune_params tsv110_tunings =
962 {
963 &tsv110_extra_costs,
964 &tsv110_addrcost_table,
965 &tsv110_regmove_cost,
966 &tsv110_vector_cost,
967 &generic_branch_cost,
968 &generic_approx_modes,
969 SVE_NOT_IMPLEMENTED, /* sve_width */
970 4, /* memmov_cost */
971 4, /* issue_rate */
972 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
985 &tsv110_prefetch_tune
986 };
987
988 static const struct tune_params xgene1_tunings =
989 {
990 &xgene1_extra_costs,
991 &xgene1_addrcost_table,
992 &xgene1_regmove_cost,
993 &xgene1_vector_cost,
994 &generic_branch_cost,
995 &xgene1_approx_modes,
996 SVE_NOT_IMPLEMENTED, /* sve_width */
997 6, /* memmov_cost */
998 4, /* issue_rate */
999 AARCH64_FUSE_NOTHING, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1011 &xgene1_prefetch_tune
1012 };
1013
1014 static const struct tune_params emag_tunings =
1015 {
1016 &xgene1_extra_costs,
1017 &xgene1_addrcost_table,
1018 &xgene1_regmove_cost,
1019 &xgene1_vector_cost,
1020 &generic_branch_cost,
1021 &xgene1_approx_modes,
1022 SVE_NOT_IMPLEMENTED,
1023 6, /* memmov_cost */
1024 4, /* issue_rate */
1025 AARCH64_FUSE_NOTHING, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1037 &xgene1_prefetch_tune
1038 };
1039
1040 static const struct tune_params qdf24xx_tunings =
1041 {
1042 &qdf24xx_extra_costs,
1043 &qdf24xx_addrcost_table,
1044 &qdf24xx_regmove_cost,
1045 &qdf24xx_vector_cost,
1046 &generic_branch_cost,
1047 &generic_approx_modes,
1048 SVE_NOT_IMPLEMENTED, /* sve_width */
1049 4, /* memmov_cost */
1050 4, /* issue_rate */
1051 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1065 };
1066
1067 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 for now. */
1069 static const struct tune_params saphira_tunings =
1070 {
1071 &generic_extra_costs,
1072 &generic_addrcost_table,
1073 &generic_regmove_cost,
1074 &generic_vector_cost,
1075 &generic_branch_cost,
1076 &generic_approx_modes,
1077 SVE_NOT_IMPLEMENTED, /* sve_width */
1078 4, /* memmov_cost */
1079 4, /* issue_rate */
1080 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1093 &generic_prefetch_tune
1094 };
1095
1096 static const struct tune_params thunderx2t99_tunings =
1097 {
1098 &thunderx2t99_extra_costs,
1099 &thunderx2t99_addrcost_table,
1100 &thunderx2t99_regmove_cost,
1101 &thunderx2t99_vector_cost,
1102 &generic_branch_cost,
1103 &generic_approx_modes,
1104 SVE_NOT_IMPLEMENTED, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1121 };
1122
1123 static const struct tune_params neoversen1_tunings =
1124 {
1125 &cortexa57_extra_costs,
1126 &generic_addrcost_table,
1127 &generic_regmove_cost,
1128 &cortexa57_vector_cost,
1129 &generic_branch_cost,
1130 &generic_approx_modes,
1131 SVE_NOT_IMPLEMENTED, /* sve_width */
1132 4, /* memmov_cost */
1133 3, /* issue_rate */
1134 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1146 &generic_prefetch_tune
1147 };
1148
1149 /* Support for fine-grained override of the tuning structures. */
1150 struct aarch64_tuning_override_function
1151 {
1152 const char* name;
1153 void (*parse_override)(const char*, struct tune_params*);
1154 };
1155
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions[] =
1162 {
1163 { "fuse", aarch64_parse_fuse_string },
1164 { "tune", aarch64_parse_tune_string },
1165 { "sve_width", aarch64_parse_sve_width_string },
1166 { NULL, NULL }
1167 };
1168
1169 /* A processor implementing AArch64. */
1170 struct processor
1171 {
1172 const char *const name;
1173 enum aarch64_processor ident;
1174 enum aarch64_processor sched_core;
1175 enum aarch64_arch arch;
1176 unsigned architecture_version;
1177 const uint64_t flags;
1178 const struct tune_params *const tune;
1179 };
1180
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures[] =
1183 {
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 };
1189
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores[] =
1192 {
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1199 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1200 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1201 };
1202
1203
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor *selected_arch;
1207 static const struct processor *selected_cpu;
1208 static const struct processor *selected_tune;
1209
1210 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params = generic_tunings;
1214
1215 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1216
1217 static tree
1218 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1219 int, bool *no_add_attrs)
1220 {
1221 /* Since we set fn_type_req to true, the caller should have checked
1222 this for us. */
1223 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1224 switch ((arm_pcs) fntype_abi (*node).id ())
1225 {
1226 case ARM_PCS_AAPCS64:
1227 case ARM_PCS_SIMD:
1228 return NULL_TREE;
1229
1230 case ARM_PCS_SVE:
1231 error ("the %qE attribute cannot be applied to an SVE function type",
1232 name);
1233 *no_add_attrs = true;
1234 return NULL_TREE;
1235
1236 case ARM_PCS_TLSDESC:
1237 case ARM_PCS_UNKNOWN:
1238 break;
1239 }
1240 gcc_unreachable ();
1241 }
1242
1243 /* Table of machine attributes. */
1244 static const struct attribute_spec aarch64_attribute_table[] =
1245 {
1246 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1247 affects_type_identity, handler, exclude } */
1248 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1249 handle_aarch64_vector_pcs_attribute, NULL },
1250 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1251 };
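/* The attribute is used on function declarations or types in the usual way,
   e.g.
     void f (float *) __attribute__ ((aarch64_vector_pcs));
   which selects the ARM_PCS_SIMD variant handled by the check above.  */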
1252
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1254
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1257 {
1258 const char *const name;
1259 const unsigned long flags_on;
1260 const unsigned long flags_off;
1261 };
1262
1263 typedef enum aarch64_cond_code
1264 {
1265 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1266 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1267 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1268 }
1269 aarch64_cc;
1270
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
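/* Flipping the low bit pairs each condition with its inverse in the
   enumeration above, e.g. AARCH64_EQ (0) <-> AARCH64_NE (1) and
   AARCH64_GE (10) <-> AARCH64_LT (11).  */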
1272
1273 struct aarch64_branch_protect_type
1274 {
1275 /* The type's name that the user passes to the branch-protection option
1276 string. */
1277 const char* name;
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1281 Return values:
1282 * AARCH64_PARSE_OK: Handling was successful.
1283 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1284 should print an error.
1285 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1286 own error. */
1287 enum aarch64_parse_opt_result (*handler)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type* subtypes;
1290 unsigned int num_subtypes;
1291 };
1292
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str, char* rest)
1295 {
1296 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1297 aarch64_enable_bti = 0;
1298 if (rest)
1299 {
1300 error ("unexpected %<%s%> after %<%s%>", rest, str);
1301 return AARCH64_PARSE_INVALID_FEATURE;
1302 }
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str, char* rest)
1308 {
1309 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1310 aarch64_ra_sign_key = AARCH64_KEY_A;
1311 aarch64_enable_bti = 1;
1312 if (rest)
1313 {
1314 error ("unexpected %<%s%> after %<%s%>", rest, str);
1315 return AARCH64_PARSE_INVALID_FEATURE;
1316 }
1317 return AARCH64_PARSE_OK;
1318 }
1319
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1322 char* rest ATTRIBUTE_UNUSED)
1323 {
1324 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1325 aarch64_ra_sign_key = AARCH64_KEY_A;
1326 return AARCH64_PARSE_OK;
1327 }
1328
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1331 char* rest ATTRIBUTE_UNUSED)
1332 {
1333 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1334 return AARCH64_PARSE_OK;
1335 }
1336
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1339 char* rest ATTRIBUTE_UNUSED)
1340 {
1341 aarch64_ra_sign_key = AARCH64_KEY_B;
1342 return AARCH64_PARSE_OK;
1343 }
1344
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1347 char* rest ATTRIBUTE_UNUSED)
1348 {
1349 aarch64_enable_bti = 1;
1350 return AARCH64_PARSE_OK;
1351 }
1352
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1356 { NULL, NULL, NULL, 0 }
1357 };
1358
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1360 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1364 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1365 { NULL, NULL, NULL, 0 }
1366 };
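/* As an example of how these tables drive parsing: for
   -mbranch-protection=pac-ret+leaf+b-key, the "pac-ret" handler sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF with the A key, then
   the "leaf" subtype widens the scope to AARCH64_FUNCTION_ALL and "b-key"
   switches the signing key to AARCH64_KEY_B.  */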
1367
1368 /* The condition codes of the processor, and the inverse function. */
1369 static const char * const aarch64_condition_codes[] =
1370 {
1371 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1372 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1373 };
1374
1375 /* The preferred condition codes for SVE conditions. */
1376 static const char *const aarch64_sve_condition_codes[] =
1377 {
1378 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1379 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1380 };
1381
1382 /* Return the assembly token for svpattern value PATTERN. */
1383
1384 static const char *
1385 svpattern_token (enum aarch64_svpattern pattern)
1386 {
1387 switch (pattern)
1388 {
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE)
1391 #undef CASE
1392 case AARCH64_NUM_SVPATTERNS:
1393 break;
1394 }
1395 gcc_unreachable ();
1396 }
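/* Each (UPPER, LOWER, VALUE) entry of AARCH64_FOR_SVPATTERN expands to a
   case returning the lower-case assembly spelling, so e.g. AARCH64_SV_ALL
   prints as "all".  */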
1397
1398 /* Return the descriptor of the SIMD ABI. */
1399
1400 static const predefined_function_abi &
1401 aarch64_simd_abi (void)
1402 {
1403 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1404 if (!simd_abi.initialized_p ())
1405 {
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi.full_reg_clobbers ();
1408 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1411 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1412 }
1413 return simd_abi;
1414 }
1415
1416 /* Return the descriptor of the SVE PCS. */
1417
1418 static const predefined_function_abi &
1419 aarch64_sve_abi (void)
1420 {
1421 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1422 if (!sve_abi.initialized_p ())
1423 {
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi.full_reg_clobbers ();
1426 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1428 for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1430 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1431 }
1432 return sve_abi;
1433 }
1434
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 const char *
1437 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1438 const char * branch_format)
1439 {
1440 rtx_code_label * tmp_label = gen_label_rtx ();
1441 char label_buf[256];
1442 char buffer[128];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1444 CODE_LABEL_NUMBER (tmp_label));
1445 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1446 rtx dest_label = operands[pos_label];
1447 operands[pos_label] = tmp_label;
1448
1449 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1450 output_asm_insn (buffer, operands);
1451
1452 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1453 operands[pos_label] = dest_label;
1454 output_asm_insn (buffer, operands);
1455 return "";
1456 }
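/* The effect is to replace a conditional branch whose target lies outside
   the +/-1 MiB conditional-branch range with a short branch around an
   unconditional "b", which reaches +/-128 MiB; roughly:

       <inverted conditional branch>  .Llocal
       b       <original destination>
   .Llocal:

   The caller supplies the already-inverted condition in BRANCH_FORMAT.  */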
1457
1458 void
1459 aarch64_err_no_fpadvsimd (machine_mode mode)
1460 {
1461 if (TARGET_GENERAL_REGS_ONLY)
1462 if (FLOAT_MODE_P (mode))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1465 else
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1468 else
1469 if (FLOAT_MODE_P (mode))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1472 else
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1475 }
1476
1477 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1478 registers. */
1479 inline bool
1480 pr_or_ffr_regnum_p (unsigned int regno)
1481 {
1482 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1483 }
1484
1485 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1486 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1487 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1488 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1489 and GENERAL_REGS is lower than the memory cost (in this case the best class
1490 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1491 cost results in bad allocations with many redundant int<->FP moves which
1492 are expensive on various cores.
1493 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1494 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1495 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1496 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1497 The result of this is that it is no longer inefficient to have a higher
1498 memory move cost than the register move cost.
1499 */
1500
1501 static reg_class_t
1502 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1503 reg_class_t best_class)
1504 {
1505 machine_mode mode;
1506
1507 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1508 || !reg_class_subset_p (FP_REGS, allocno_class))
1509 return allocno_class;
1510
1511 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1512 || !reg_class_subset_p (FP_REGS, best_class))
1513 return best_class;
1514
1515 mode = PSEUDO_REGNO_MODE (regno);
1516 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1517 }
1518
1519 static unsigned int
1520 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1521 {
1522 if (GET_MODE_UNIT_SIZE (mode) == 4)
1523 return aarch64_tune_params.min_div_recip_mul_sf;
1524 return aarch64_tune_params.min_div_recip_mul_df;
1525 }
1526
1527 /* Return the reassociation width of treeop OPC with mode MODE. */
1528 static int
1529 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1530 {
1531 if (VECTOR_MODE_P (mode))
1532 return aarch64_tune_params.vec_reassoc_width;
1533 if (INTEGRAL_MODE_P (mode))
1534 return aarch64_tune_params.int_reassoc_width;
1535 /* Avoid reassociating floating point addition so we emit more FMAs. */
1536 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1537 return aarch64_tune_params.fp_reassoc_width;
1538 return 1;
1539 }
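/* With generic_tunings this yields a width of 2 for integer modes and 4 for
   floating-point multiplies, while floating-point additions fall through to
   the final return and keep a width of 1, leaving them in a shape the
   FMA-forming passes can use.  */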
1540
1541 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1542 unsigned
1543 aarch64_dbx_register_number (unsigned regno)
1544 {
1545 if (GP_REGNUM_P (regno))
1546 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1547 else if (regno == SP_REGNUM)
1548 return AARCH64_DWARF_SP;
1549 else if (FP_REGNUM_P (regno))
1550 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1551 else if (PR_REGNUM_P (regno))
1552 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1553 else if (regno == VG_REGNUM)
1554 return AARCH64_DWARF_VG;
1555
1556 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1557 equivalent DWARF register. */
1558 return DWARF_FRAME_REGISTERS;
1559 }
1560
1561 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1562 integer, otherwise return X unmodified. */
1563 static rtx
1564 aarch64_bit_representation (rtx x)
1565 {
1566 if (CONST_DOUBLE_P (x))
1567 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1568 return x;
1569 }
1570
1571 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1572 static bool
1573 aarch64_advsimd_struct_mode_p (machine_mode mode)
1574 {
1575 return (TARGET_SIMD
1576 && (mode == OImode || mode == CImode || mode == XImode));
1577 }
1578
1579 /* Return true if MODE is an SVE predicate mode. */
1580 static bool
1581 aarch64_sve_pred_mode_p (machine_mode mode)
1582 {
1583 return (TARGET_SVE
1584 && (mode == VNx16BImode
1585 || mode == VNx8BImode
1586 || mode == VNx4BImode
1587 || mode == VNx2BImode));
1588 }
1589
1590 /* Three mutually-exclusive flags describing a vector or predicate type. */
1591 const unsigned int VEC_ADVSIMD = 1;
1592 const unsigned int VEC_SVE_DATA = 2;
1593 const unsigned int VEC_SVE_PRED = 4;
1594 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1595 a structure of 2, 3 or 4 vectors. */
1596 const unsigned int VEC_STRUCT = 8;
1597 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1598 vector has fewer significant bytes than a full SVE vector. */
1599 const unsigned int VEC_PARTIAL = 16;
1600 /* Useful combinations of the above. */
1601 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1602 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1603
1604 /* Return a set of flags describing the vector properties of mode MODE.
1605 Ignore modes that are not supported by the current target. */
1606 static unsigned int
1607 aarch64_classify_vector_mode (machine_mode mode)
1608 {
1609 if (aarch64_advsimd_struct_mode_p (mode))
1610 return VEC_ADVSIMD | VEC_STRUCT;
1611
1612 if (aarch64_sve_pred_mode_p (mode))
1613 return VEC_SVE_PRED;
1614
1615 /* Make the decision based on the mode's enum value rather than its
1616 properties, so that we keep the correct classification regardless
1617 of -msve-vector-bits. */
1618 switch (mode)
1619 {
1620 /* Partial SVE QI vectors. */
1621 case E_VNx2QImode:
1622 case E_VNx4QImode:
1623 case E_VNx8QImode:
1624 /* Partial SVE HI vectors. */
1625 case E_VNx2HImode:
1626 case E_VNx4HImode:
1627 /* Partial SVE SI vector. */
1628 case E_VNx2SImode:
1629 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1630
1631 case E_VNx16QImode:
1632 case E_VNx8HImode:
1633 case E_VNx4SImode:
1634 case E_VNx2DImode:
1635 case E_VNx8HFmode:
1636 case E_VNx4SFmode:
1637 case E_VNx2DFmode:
1638 return TARGET_SVE ? VEC_SVE_DATA : 0;
1639
1640 /* x2 SVE vectors. */
1641 case E_VNx32QImode:
1642 case E_VNx16HImode:
1643 case E_VNx8SImode:
1644 case E_VNx4DImode:
1645 case E_VNx16HFmode:
1646 case E_VNx8SFmode:
1647 case E_VNx4DFmode:
1648 /* x3 SVE vectors. */
1649 case E_VNx48QImode:
1650 case E_VNx24HImode:
1651 case E_VNx12SImode:
1652 case E_VNx6DImode:
1653 case E_VNx24HFmode:
1654 case E_VNx12SFmode:
1655 case E_VNx6DFmode:
1656 /* x4 SVE vectors. */
1657 case E_VNx64QImode:
1658 case E_VNx32HImode:
1659 case E_VNx16SImode:
1660 case E_VNx8DImode:
1661 case E_VNx32HFmode:
1662 case E_VNx16SFmode:
1663 case E_VNx8DFmode:
1664 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1665
1666 /* 64-bit Advanced SIMD vectors. */
1667 case E_V8QImode:
1668 case E_V4HImode:
1669 case E_V2SImode:
1670 /* ...E_V1DImode doesn't exist. */
1671 case E_V4HFmode:
1672 case E_V2SFmode:
1673 case E_V1DFmode:
1674 /* 128-bit Advanced SIMD vectors. */
1675 case E_V16QImode:
1676 case E_V8HImode:
1677 case E_V4SImode:
1678 case E_V2DImode:
1679 case E_V8HFmode:
1680 case E_V4SFmode:
1681 case E_V2DFmode:
1682 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1683
1684 default:
1685 return 0;
1686 }
1687 }
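/* For example, assuming the relevant target features are enabled:
   V4SImode classifies as VEC_ADVSIMD, VNx4SImode as VEC_SVE_DATA,
   VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL, and the x2 tuple mode
   VNx8SImode as VEC_SVE_DATA | VEC_STRUCT.  */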
1688
1689 /* Return true if MODE is any of the data vector modes, including
1690 structure modes. */
1691 static bool
1692 aarch64_vector_data_mode_p (machine_mode mode)
1693 {
1694 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1695 }
1696
1697 /* Return true if MODE is any form of SVE mode, including predicates,
1698 vectors and structures. */
1699 bool
1700 aarch64_sve_mode_p (machine_mode mode)
1701 {
1702 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1703 }
1704
1705 /* Return true if MODE is an SVE data vector mode; either a single vector
1706 or a structure of vectors. */
1707 static bool
1708 aarch64_sve_data_mode_p (machine_mode mode)
1709 {
1710 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1711 }
1712
1713 /* Return the number of defined bytes in one constituent vector of
1714 SVE mode MODE, which has vector flags VEC_FLAGS. */
1715 static poly_int64
1716 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1717 {
1718 if (vec_flags & VEC_PARTIAL)
1719 /* A single partial vector. */
1720 return GET_MODE_SIZE (mode);
1721
1722 if (vec_flags & VEC_SVE_DATA)
1723 /* A single vector or a tuple. */
1724 return BYTES_PER_SVE_VECTOR;
1725
1726 /* A single predicate. */
1727 gcc_assert (vec_flags & VEC_SVE_PRED);
1728 return BYTES_PER_SVE_PRED;
1729 }
1730
1731 /* Implement target hook TARGET_ARRAY_MODE. */
1732 static opt_machine_mode
1733 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1734 {
1735 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1736 && IN_RANGE (nelems, 2, 4))
1737 return mode_for_vector (GET_MODE_INNER (mode),
1738 GET_MODE_NUNITS (mode) * nelems);
1739
1740 return opt_machine_mode ();
1741 }
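/* E.g. an array of three VNx4SImode vectors is given mode VNx12SImode,
   one of the SVE structure modes listed in aarch64_classify_vector_mode.  */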
1742
1743 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1744 static bool
1745 aarch64_array_mode_supported_p (machine_mode mode,
1746 unsigned HOST_WIDE_INT nelems)
1747 {
1748 if (TARGET_SIMD
1749 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1750 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1751 && (nelems >= 2 && nelems <= 4))
1752 return true;
1753
1754 return false;
1755 }
1756
1757 /* Return the SVE predicate mode to use for elements that have
1758 ELEM_NBYTES bytes, if such a mode exists. */
1759
1760 opt_machine_mode
1761 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1762 {
1763 if (TARGET_SVE)
1764 {
1765 if (elem_nbytes == 1)
1766 return VNx16BImode;
1767 if (elem_nbytes == 2)
1768 return VNx8BImode;
1769 if (elem_nbytes == 4)
1770 return VNx4BImode;
1771 if (elem_nbytes == 8)
1772 return VNx2BImode;
1773 }
1774 return opt_machine_mode ();
1775 }
1776
1777 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1778
1779 static opt_machine_mode
1780 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1781 {
1782 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1783 {
1784 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1785 machine_mode pred_mode;
1786 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1787 return pred_mode;
1788 }
1789
1790 return default_get_mask_mode (nunits, nbytes);
1791 }
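/* So a full SVE vector of 32-bit elements (VNx4SImode) gets VNx4BImode as
   its mask mode, while fixed-length Advanced SIMD vectors fall back to the
   generic vector-of-booleans handling.  */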
1792
1793 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1794
1795 opt_machine_mode
1796 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1797 {
1798 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1799 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1800 machine_mode mode;
1801 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1802 if (inner_mode == GET_MODE_INNER (mode)
1803 && known_eq (nunits, GET_MODE_NUNITS (mode))
1804 && aarch64_sve_data_mode_p (mode))
1805 return mode;
1806 return opt_machine_mode ();
1807 }
1808
1809 /* Return the integer element mode associated with SVE mode MODE. */
1810
1811 static scalar_int_mode
1812 aarch64_sve_element_int_mode (machine_mode mode)
1813 {
1814 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1815 GET_MODE_NUNITS (mode));
1816 return int_mode_for_size (elt_bits, 0).require ();
1817 }
1818
1819 /* Return the integer vector mode associated with SVE mode MODE.
1820 Unlike mode_for_int_vector, this can handle the case in which
1821 MODE is a predicate (and thus has a different total size). */
1822
1823 machine_mode
1824 aarch64_sve_int_mode (machine_mode mode)
1825 {
1826 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1827 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1828 }
1829
1830 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1831 prefer to use the first arithmetic operand as the else value if
1832 the else value doesn't matter, since that exactly matches the SVE
1833 destructive merging form. For ternary operations we could either
1834 pick the first operand and use FMAD-like instructions or the last
1835 operand and use FMLA-like instructions; the latter seems more
1836 natural. */
1837
1838 static tree
1839 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1840 {
1841 return nops == 3 ? ops[2] : ops[0];
1842 }
1843
1844 /* Implement TARGET_HARD_REGNO_NREGS. */
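/* For example, an Advanced SIMD V4SImode value occupies a single FP
register, while an SVE two-vector tuple such as VNx8SImode occupies
two consecutive FP registers. */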
1845
1846 static unsigned int
1847 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1848 {
1849 /* ??? Logically we should only need to provide a value when
1850 HARD_REGNO_MODE_OK says that the combination is valid,
1851 but at the moment we need to handle all modes. Just ignore
1852 any runtime parts for registers that can't store them. */
1853 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1854 switch (aarch64_regno_regclass (regno))
1855 {
1856 case FP_REGS:
1857 case FP_LO_REGS:
1858 case FP_LO8_REGS:
1859 {
1860 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1861 if (vec_flags & VEC_SVE_DATA)
1862 return exact_div (GET_MODE_SIZE (mode),
1863 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1864 return CEIL (lowest_size, UNITS_PER_VREG);
1865 }
1866 case PR_REGS:
1867 case PR_LO_REGS:
1868 case PR_HI_REGS:
1869 case FFR_REGS:
1870 case PR_AND_FFR_REGS:
1871 return 1;
1872 default:
1873 return CEIL (lowest_size, UNITS_PER_WORD);
1874 }
1875 gcc_unreachable ();
1876 }
1877
1878 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1879
1880 static bool
1881 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1882 {
1883 if (GET_MODE_CLASS (mode) == MODE_CC)
1884 return regno == CC_REGNUM;
1885
1886 if (regno == VG_REGNUM)
1887 /* This must have the same size as _Unwind_Word. */
1888 return mode == DImode;
1889
1890 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1891 /* At the moment, partial vector modes are only useful for memory
1892 references, but that could change in future. */
1893 if (vec_flags & VEC_PARTIAL)
1894 return false;
1895
1896 if (vec_flags & VEC_SVE_PRED)
1897 return pr_or_ffr_regnum_p (regno);
1898
1899 if (pr_or_ffr_regnum_p (regno))
1900 return false;
1901
1902 if (regno == SP_REGNUM)
1903 /* The purpose of comparing with ptr_mode is to support the
1904 global register variable associated with the stack pointer
1905 register via the syntax of asm ("wsp") in ILP32. */
1906 return mode == Pmode || mode == ptr_mode;
1907
1908 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1909 return mode == Pmode;
1910
1911 if (GP_REGNUM_P (regno))
1912 {
1913 if (known_le (GET_MODE_SIZE (mode), 8))
1914 return true;
1915 else if (known_le (GET_MODE_SIZE (mode), 16))
1916 return (regno & 1) == 0;
1917 }
1918 else if (FP_REGNUM_P (regno))
1919 {
1920 if (vec_flags & VEC_STRUCT)
1921 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1922 else
1923 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1924 }
1925
1926 return false;
1927 }
1928
1929 /* Return true if TYPE is a type that should be passed or returned in
1930 SVE registers, assuming enough registers are available. When returning
1931 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
1932 respectively. */
1933
1934 static bool
1935 aarch64_sve_argument_p (const_tree type, unsigned int *num_zr,
1936 unsigned int *num_pr)
1937 {
1938 if (aarch64_sve::svbool_type_p (type))
1939 {
1940 *num_pr = 1;
1941 *num_zr = 0;
1942 return true;
1943 }
1944
1945 if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type))
1946 {
1947 *num_pr = 0;
1948 *num_zr = nvectors;
1949 return true;
1950 }
1951
1952 return false;
1953 }
1954
1955 /* Return true if a function with type FNTYPE returns its value in
1956 SVE vector or predicate registers. */
1957
1958 static bool
1959 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
1960 {
1961 unsigned int num_zr, num_pr;
1962 tree return_type = TREE_TYPE (fntype);
1963 return (return_type != error_mark_node
1964 && aarch64_sve_argument_p (return_type, &num_zr, &num_pr));
1965 }
1966
1967 /* Return true if a function with type FNTYPE takes arguments in
1968 SVE vector or predicate registers. */
1969
1970 static bool
1971 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
1972 {
1973 CUMULATIVE_ARGS args_so_far_v;
1974 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
1975 NULL_TREE, 0, true);
1976 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
1977
1978 for (tree chain = TYPE_ARG_TYPES (fntype);
1979 chain && chain != void_list_node;
1980 chain = TREE_CHAIN (chain))
1981 {
1982 tree arg_type = TREE_VALUE (chain);
1983 if (arg_type == error_mark_node)
1984 return false;
1985
1986 function_arg_info arg (arg_type, /*named=*/true);
1987 apply_pass_by_reference_rules (&args_so_far_v, arg);
1988 unsigned int num_zr, num_pr;
1989 if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
1990 return true;
1991
1992 targetm.calls.function_arg_advance (args_so_far, arg);
1993 }
1994 return false;
1995 }
1996
1997 /* Implement TARGET_FNTYPE_ABI. */
1998
1999 static const predefined_function_abi &
2000 aarch64_fntype_abi (const_tree fntype)
2001 {
2002 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2003 return aarch64_simd_abi ();
2004
2005 if (aarch64_returns_value_in_sve_regs_p (fntype)
2006 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2007 return aarch64_sve_abi ();
2008
2009 return default_function_abi;
2010 }
2011
2012 /* Return true if we should emit CFI for register REGNO. */
2013
2014 static bool
2015 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2016 {
2017 return (GP_REGNUM_P (regno)
2018 || !default_function_abi.clobbers_full_reg_p (regno));
2019 }
2020
2021 /* Return the mode we should use to save and restore register REGNO. */
2022
2023 static machine_mode
2024 aarch64_reg_save_mode (unsigned int regno)
2025 {
2026 if (GP_REGNUM_P (regno))
2027 return DImode;
2028
2029 if (FP_REGNUM_P (regno))
2030 switch (crtl->abi->id ())
2031 {
2032 case ARM_PCS_AAPCS64:
2033 /* Only the low 64 bits are saved by the base PCS. */
2034 return DFmode;
2035
2036 case ARM_PCS_SIMD:
2037 /* The vector PCS saves the low 128 bits (which is the full
2038 register on non-SVE targets). */
2039 return TFmode;
2040
2041 case ARM_PCS_SVE:
2042 /* Use vectors of DImode for registers that need frame
2043 information, so that the first 64 bits of the save slot
2044 are always the equivalent of what storing D<n> would give. */
2045 if (aarch64_emit_cfi_for_reg_p (regno))
2046 return VNx2DImode;
2047
2048 /* Use vectors of bytes otherwise, so that the layout is
2049 endian-agnostic, and so that we can use LDR and STR for
2050 big-endian targets. */
2051 return VNx16QImode;
2052
2053 case ARM_PCS_TLSDESC:
2054 case ARM_PCS_UNKNOWN:
2055 break;
2056 }
2057
2058 if (PR_REGNUM_P (regno))
2059 /* Save the full predicate register. */
2060 return VNx16BImode;
2061
2062 gcc_unreachable ();
2063 }
2064
2065 /* Implement TARGET_INSN_CALLEE_ABI. */
2066
2067 const predefined_function_abi &
2068 aarch64_insn_callee_abi (const rtx_insn *insn)
2069 {
2070 rtx pat = PATTERN (insn);
2071 gcc_assert (GET_CODE (pat) == PARALLEL);
2072 rtx unspec = XVECEXP (pat, 0, 1);
2073 gcc_assert (GET_CODE (unspec) == UNSPEC
2074 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2075 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2076 }
2077
2078 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2079 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2080 clobbers the top 64 bits when restoring the bottom 64 bits. */
2081
2082 static bool
2083 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2084 unsigned int regno,
2085 machine_mode mode)
2086 {
2087 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2088 {
2089 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2090 unsigned int nregs = hard_regno_nregs (regno, mode);
2091 if (nregs > 1)
2092 per_register_size = exact_div (per_register_size, nregs);
2093 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2094 return maybe_gt (per_register_size, 16);
2095 return maybe_gt (per_register_size, 8);
2096 }
2097 return false;
2098 }
2099
2100 /* Implement REGMODE_NATURAL_SIZE. */
2101 poly_uint64
2102 aarch64_regmode_natural_size (machine_mode mode)
2103 {
2104 /* The natural size for SVE data modes is one SVE data vector,
2105 and similarly for predicates. We can't independently modify
2106 anything smaller than that. */
2107 /* ??? For now, only do this for variable-width SVE registers.
2108 Doing it for constant-sized registers breaks lower-subreg.c. */
2109 /* ??? And once that's fixed, we should probably have similar
2110 code for Advanced SIMD. */
2111 if (!aarch64_sve_vg.is_constant ())
2112 {
2113 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2114 if (vec_flags & VEC_SVE_PRED)
2115 return BYTES_PER_SVE_PRED;
2116 if (vec_flags & VEC_SVE_DATA)
2117 return BYTES_PER_SVE_VECTOR;
2118 }
2119 return UNITS_PER_WORD;
2120 }
2121
2122 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2123 machine_mode
2124 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2125 machine_mode mode)
2126 {
2127 /* The predicate mode determines which bits are significant and
2128 which are "don't care". Decreasing the number of lanes would
2129 lose data while increasing the number of lanes would make bits
2130 unnecessarily significant. */
2131 if (PR_REGNUM_P (regno))
2132 return mode;
2133 if (known_ge (GET_MODE_SIZE (mode), 4))
2134 return mode;
2135 else
2136 return SImode;
2137 }
2138
2139 /* Return true if I's bits are consecutive ones from the MSB. */
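/* For example, 0xffff000000000000 qualifies (its negation is a power
of two), whereas 0xff00ff0000000000 does not. */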
2140 bool
2141 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2142 {
2143 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2144 }
2145
2146 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2147 that strcpy from constants will be faster. */
2148
2149 static HOST_WIDE_INT
2150 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2151 {
2152 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2153 return MAX (align, BITS_PER_WORD);
2154 return align;
2155 }
2156
2157 /* Return true if calls to DECL should be treated as
2158 long-calls (i.e. called via a register). */
2159 static bool
2160 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2161 {
2162 return false;
2163 }
2164
2165 /* Return true if calls to symbol-ref SYM should be treated as
2166 long-calls (i.e. called via a register). */
2167 bool
2168 aarch64_is_long_call_p (rtx sym)
2169 {
2170 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2171 }
2172
2173 /* Return true if calls to symbol-ref SYM should not go through
2174 plt stubs. */
2175
2176 bool
2177 aarch64_is_noplt_call_p (rtx sym)
2178 {
2179 const_tree decl = SYMBOL_REF_DECL (sym);
2180
2181 if (flag_pic
2182 && decl
2183 && (!flag_plt
2184 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2185 && !targetm.binds_local_p (decl))
2186 return true;
2187
2188 return false;
2189 }
2190
2191 /* Return true if the offsets to a zero/sign-extract operation
2192 represent an expression that matches an extend operation. The
2193 operands represent the parameters from
2194
2195 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
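/* For example, with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34,
the extract selects the low 34 bits of (reg * 4), which is the same
value as the 32-bit zero-extension of reg multiplied by 4, so the
function returns true. */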
2196 bool
2197 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2198 rtx extract_imm)
2199 {
2200 HOST_WIDE_INT mult_val, extract_val;
2201
2202 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2203 return false;
2204
2205 mult_val = INTVAL (mult_imm);
2206 extract_val = INTVAL (extract_imm);
2207
2208 if (extract_val > 8
2209 && extract_val < GET_MODE_BITSIZE (mode)
2210 && exact_log2 (extract_val & ~7) > 0
2211 && (extract_val & 7) <= 4
2212 && mult_val == (1 << (extract_val & 7)))
2213 return true;
2214
2215 return false;
2216 }
2217
2218 /* Emit an insn that's a simple single-set. Both the operands must be
2219 known to be valid. */
2220 inline static rtx_insn *
2221 emit_set_insn (rtx x, rtx y)
2222 {
2223 return emit_insn (gen_rtx_SET (x, y));
2224 }
2225
2226 /* X and Y are two things to compare using CODE. Emit the compare insn and
2227 return the rtx for register 0 in the proper mode. */
2228 rtx
2229 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2230 {
2231 machine_mode cmp_mode = GET_MODE (x);
2232 machine_mode cc_mode;
2233 rtx cc_reg;
2234
2235 if (cmp_mode == TImode)
2236 {
2237 gcc_assert (code == NE);
2238
2239 cc_mode = CCmode;
2240 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2241
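/* Compare the low halves first and then use CCMP to fold in a
comparison of the high halves, so that the final condition flags
are EQ iff both halves of X and Y are equal. */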
2242 rtx x_lo = operand_subword (x, 0, 0, TImode);
2243 rtx y_lo = operand_subword (y, 0, 0, TImode);
2244 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2245
2246 rtx x_hi = operand_subword (x, 1, 0, TImode);
2247 rtx y_hi = operand_subword (y, 1, 0, TImode);
2248 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2249 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2250 GEN_INT (AARCH64_EQ)));
2251 }
2252 else
2253 {
2254 cc_mode = SELECT_CC_MODE (code, x, y);
2255 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2256 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2257 }
2258 return cc_reg;
2259 }
2260
2261 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2262
2263 static rtx
2264 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2265 machine_mode y_mode)
2266 {
2267 if (y_mode == E_QImode || y_mode == E_HImode)
2268 {
2269 if (CONST_INT_P (y))
2270 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2271 else
2272 {
2273 rtx t, cc_reg;
2274 machine_mode cc_mode;
2275
2276 t = gen_rtx_ZERO_EXTEND (SImode, y);
2277 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2278 cc_mode = CC_SWPmode;
2279 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2280 emit_set_insn (cc_reg, t);
2281 return cc_reg;
2282 }
2283 }
2284
2285 if (!aarch64_plus_operand (y, y_mode))
2286 y = force_reg (y_mode, y);
2287
2288 return aarch64_gen_compare_reg (code, x, y);
2289 }
2290
2291 /* Build the SYMBOL_REF for __tls_get_addr. */
2292
2293 static GTY(()) rtx tls_get_addr_libfunc;
2294
2295 rtx
2296 aarch64_tls_get_addr (void)
2297 {
2298 if (!tls_get_addr_libfunc)
2299 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2300 return tls_get_addr_libfunc;
2301 }
2302
2303 /* Return the TLS model to use for ADDR. */
2304
2305 static enum tls_model
2306 tls_symbolic_operand_type (rtx addr)
2307 {
2308 enum tls_model tls_kind = TLS_MODEL_NONE;
2309 if (GET_CODE (addr) == CONST)
2310 {
2311 poly_int64 addend;
2312 rtx sym = strip_offset (addr, &addend);
2313 if (GET_CODE (sym) == SYMBOL_REF)
2314 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2315 }
2316 else if (GET_CODE (addr) == SYMBOL_REF)
2317 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2318
2319 return tls_kind;
2320 }
2321
2322 /* We allow LO_SUMs in addresses as legitimate addresses, so that
2323 combine can take care of combining addresses where necessary;
2324 for generation purposes, however, we generate the address
2325 as:
2326 RTL Absolute
2327 tmp = hi (symbol_ref); adrp x1, foo
2328 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2329 nop
2330
2331 PIC TLS
2332 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2333 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2334 bl __tls_get_addr
2335 nop
2336
2337 Load TLS symbol, depending on TLS mechanism and TLS access model.
2338
2339 Global Dynamic - Traditional TLS:
2340 adrp tmp, :tlsgd:imm
2341 add dest, tmp, #:tlsgd_lo12:imm
2342 bl __tls_get_addr
2343
2344 Global Dynamic - TLS Descriptors:
2345 adrp dest, :tlsdesc:imm
2346 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2347 add dest, dest, #:tlsdesc_lo12:imm
2348 blr tmp
2349 mrs tp, tpidr_el0
2350 add dest, dest, tp
2351
2352 Initial Exec:
2353 mrs tp, tpidr_el0
2354 adrp tmp, :gottprel:imm
2355 ldr dest, [tmp, #:gottprel_lo12:imm]
2356 add dest, dest, tp
2357
2358 Local Exec:
2359 mrs tp, tpidr_el0
2360 add t0, tp, #:tprel_hi12:imm, lsl #12
2361 add t0, t0, #:tprel_lo12_nc:imm
2362 */
2363
2364 static void
2365 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2366 enum aarch64_symbol_type type)
2367 {
2368 switch (type)
2369 {
2370 case SYMBOL_SMALL_ABSOLUTE:
2371 {
2372 /* In ILP32, the mode of dest can be either SImode or DImode. */
2373 rtx tmp_reg = dest;
2374 machine_mode mode = GET_MODE (dest);
2375
2376 gcc_assert (mode == Pmode || mode == ptr_mode);
2377
2378 if (can_create_pseudo_p ())
2379 tmp_reg = gen_reg_rtx (mode);
2380
2381 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2382 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2383 return;
2384 }
2385
2386 case SYMBOL_TINY_ABSOLUTE:
2387 emit_insn (gen_rtx_SET (dest, imm));
2388 return;
2389
2390 case SYMBOL_SMALL_GOT_28K:
2391 {
2392 machine_mode mode = GET_MODE (dest);
2393 rtx gp_rtx = pic_offset_table_rtx;
2394 rtx insn;
2395 rtx mem;
2396
2397 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2398 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2399 estimate rtx costs, in which case pic_offset_table_rtx is not
2400 initialized. In that case there is no need to generate the first
2401 adrp instruction, as the final cost for a global variable access
2402 is one instruction. */
2403 if (gp_rtx != NULL)
2404 {
2405 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
2406 we use the page base as the GOT base, the first page may be
2407 wasted; in the worst case only 28K of GOT space remains).
2408
2409 The generated instruction sequence for accessing a global
2410 variable is:
2411
2412 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2413
2414 Only one instruction is needed, but we must initialize
2415 pic_offset_table_rtx properly. We emit the initialization insn
2416 for every global access and rely on CSE to remove the redundant ones.
2417
2418 The final instruction sequence will look like the following
2419 for multiple global variable accesses:
2420
2421 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2422
2423 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2424 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2425 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2426 ... */
2427
2428 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2429 crtl->uses_pic_offset_table = 1;
2430 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2431
2432 if (mode != GET_MODE (gp_rtx))
2433 gp_rtx = gen_lowpart (mode, gp_rtx);
2434
2435 }
2436
2437 if (mode == ptr_mode)
2438 {
2439 if (mode == DImode)
2440 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2441 else
2442 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2443
2444 mem = XVECEXP (SET_SRC (insn), 0, 0);
2445 }
2446 else
2447 {
2448 gcc_assert (mode == Pmode);
2449
2450 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2451 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2452 }
2453
2454 /* The operand is expected to be a MEM. Whenever the related insn
2455 pattern changes, the code above that extracts MEM must be
2456 updated accordingly. */
2457 gcc_assert (GET_CODE (mem) == MEM);
2458 MEM_READONLY_P (mem) = 1;
2459 MEM_NOTRAP_P (mem) = 1;
2460 emit_insn (insn);
2461 return;
2462 }
2463
2464 case SYMBOL_SMALL_GOT_4G:
2465 {
2466 /* In ILP32, the mode of dest can be either SImode or DImode,
2467 while the got entry is always of SImode size. The mode of
2468 dest depends on how dest is used: if dest is assigned to a
2469 pointer (e.g. in the memory), it has SImode; it may have
2470 DImode if dest is dereferenced to access the memory.
2471 This is why we have to handle three different ldr_got_small
2472 patterns here (two patterns for ILP32). */
2473
2474 rtx insn;
2475 rtx mem;
2476 rtx tmp_reg = dest;
2477 machine_mode mode = GET_MODE (dest);
2478
2479 if (can_create_pseudo_p ())
2480 tmp_reg = gen_reg_rtx (mode);
2481
2482 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2483 if (mode == ptr_mode)
2484 {
2485 if (mode == DImode)
2486 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2487 else
2488 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2489
2490 mem = XVECEXP (SET_SRC (insn), 0, 0);
2491 }
2492 else
2493 {
2494 gcc_assert (mode == Pmode);
2495
2496 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2497 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2498 }
2499
2500 gcc_assert (GET_CODE (mem) == MEM);
2501 MEM_READONLY_P (mem) = 1;
2502 MEM_NOTRAP_P (mem) = 1;
2503 emit_insn (insn);
2504 return;
2505 }
2506
2507 case SYMBOL_SMALL_TLSGD:
2508 {
2509 rtx_insn *insns;
2510 machine_mode mode = GET_MODE (dest);
2511 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2512
2513 start_sequence ();
2514 if (TARGET_ILP32)
2515 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2516 else
2517 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2518 insns = get_insns ();
2519 end_sequence ();
2520
2521 RTL_CONST_CALL_P (insns) = 1;
2522 emit_libcall_block (insns, dest, result, imm);
2523 return;
2524 }
2525
2526 case SYMBOL_SMALL_TLSDESC:
2527 {
2528 machine_mode mode = GET_MODE (dest);
2529 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2530 rtx tp;
2531
2532 gcc_assert (mode == Pmode || mode == ptr_mode);
2533
2534 /* In ILP32, the got entry is always of SImode size. Unlike
2535 small GOT, the dest is fixed at reg 0. */
2536 if (TARGET_ILP32)
2537 emit_insn (gen_tlsdesc_small_si (imm));
2538 else
2539 emit_insn (gen_tlsdesc_small_di (imm));
2540 tp = aarch64_load_tp (NULL);
2541
2542 if (mode != Pmode)
2543 tp = gen_lowpart (mode, tp);
2544
2545 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2546 if (REG_P (dest))
2547 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2548 return;
2549 }
2550
2551 case SYMBOL_SMALL_TLSIE:
2552 {
2553 /* In ILP32, the mode of dest can be either SImode or DImode,
2554 while the got entry is always of SImode size. The mode of
2555 dest depends on how dest is used: if dest is assigned to a
2556 pointer (e.g. in the memory), it has SImode; it may have
2557 DImode if dest is dereferenced to access the memory.
2558 This is why we have to handle three different tlsie_small
2559 patterns here (two patterns for ILP32). */
2560 machine_mode mode = GET_MODE (dest);
2561 rtx tmp_reg = gen_reg_rtx (mode);
2562 rtx tp = aarch64_load_tp (NULL);
2563
2564 if (mode == ptr_mode)
2565 {
2566 if (mode == DImode)
2567 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2568 else
2569 {
2570 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2571 tp = gen_lowpart (mode, tp);
2572 }
2573 }
2574 else
2575 {
2576 gcc_assert (mode == Pmode);
2577 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2578 }
2579
2580 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2581 if (REG_P (dest))
2582 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2583 return;
2584 }
2585
2586 case SYMBOL_TLSLE12:
2587 case SYMBOL_TLSLE24:
2588 case SYMBOL_TLSLE32:
2589 case SYMBOL_TLSLE48:
2590 {
2591 machine_mode mode = GET_MODE (dest);
2592 rtx tp = aarch64_load_tp (NULL);
2593
2594 if (mode != Pmode)
2595 tp = gen_lowpart (mode, tp);
2596
2597 switch (type)
2598 {
2599 case SYMBOL_TLSLE12:
2600 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2601 (dest, tp, imm));
2602 break;
2603 case SYMBOL_TLSLE24:
2604 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2605 (dest, tp, imm));
2606 break;
2607 case SYMBOL_TLSLE32:
2608 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2609 (dest, imm));
2610 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2611 (dest, dest, tp));
2612 break;
2613 case SYMBOL_TLSLE48:
2614 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2615 (dest, imm));
2616 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2617 (dest, dest, tp));
2618 break;
2619 default:
2620 gcc_unreachable ();
2621 }
2622
2623 if (REG_P (dest))
2624 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2625 return;
2626 }
2627
2628 case SYMBOL_TINY_GOT:
2629 emit_insn (gen_ldr_got_tiny (dest, imm));
2630 return;
2631
2632 case SYMBOL_TINY_TLSIE:
2633 {
2634 machine_mode mode = GET_MODE (dest);
2635 rtx tp = aarch64_load_tp (NULL);
2636
2637 if (mode == ptr_mode)
2638 {
2639 if (mode == DImode)
2640 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2641 else
2642 {
2643 tp = gen_lowpart (mode, tp);
2644 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2645 }
2646 }
2647 else
2648 {
2649 gcc_assert (mode == Pmode);
2650 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2651 }
2652
2653 if (REG_P (dest))
2654 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2655 return;
2656 }
2657
2658 default:
2659 gcc_unreachable ();
2660 }
2661 }
2662
2663 /* Emit a move from SRC to DEST. Assume that the move expanders can
2664 handle all moves if !can_create_pseudo_p (). The distinction is
2665 important because, unlike emit_move_insn, the move expanders know
2666 how to force Pmode objects into the constant pool even when the
2667 constant pool address is not itself legitimate. */
2668 static rtx
2669 aarch64_emit_move (rtx dest, rtx src)
2670 {
2671 return (can_create_pseudo_p ()
2672 ? emit_move_insn (dest, src)
2673 : emit_move_insn_1 (dest, src));
2674 }
2675
2676 /* Apply UNOPTAB to OP and store the result in DEST. */
2677
2678 static void
2679 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2680 {
2681 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2682 if (dest != tmp)
2683 emit_move_insn (dest, tmp);
2684 }
2685
2686 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2687
2688 static void
2689 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2690 {
2691 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2692 OPTAB_DIRECT);
2693 if (dest != tmp)
2694 emit_move_insn (dest, tmp);
2695 }
2696
2697 /* Split a 128-bit move operation into two 64-bit move operations,
2698 taking care to handle partial overlap of register to register
2699 copies. Special cases are needed when moving between GP regs and
2700 FP regs. SRC can be a register, constant or memory; DST a register
2701 or memory. If either operand is memory it must not have any side
2702 effects. */
2703 void
2704 aarch64_split_128bit_move (rtx dst, rtx src)
2705 {
2706 rtx dst_lo, dst_hi;
2707 rtx src_lo, src_hi;
2708
2709 machine_mode mode = GET_MODE (dst);
2710
2711 gcc_assert (mode == TImode || mode == TFmode);
2712 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2713 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2714
2715 if (REG_P (dst) && REG_P (src))
2716 {
2717 int src_regno = REGNO (src);
2718 int dst_regno = REGNO (dst);
2719
2720 /* Handle FP <-> GP regs. */
2721 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2722 {
2723 src_lo = gen_lowpart (word_mode, src);
2724 src_hi = gen_highpart (word_mode, src);
2725
2726 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2727 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2728 return;
2729 }
2730 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2731 {
2732 dst_lo = gen_lowpart (word_mode, dst);
2733 dst_hi = gen_highpart (word_mode, dst);
2734
2735 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2736 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2737 return;
2738 }
2739 }
2740
2741 dst_lo = gen_lowpart (word_mode, dst);
2742 dst_hi = gen_highpart (word_mode, dst);
2743 src_lo = gen_lowpart (word_mode, src);
2744 src_hi = gen_highpart_mode (word_mode, mode, src);
2745
2746 /* At most one pairing may overlap. */
2747 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2748 {
2749 aarch64_emit_move (dst_hi, src_hi);
2750 aarch64_emit_move (dst_lo, src_lo);
2751 }
2752 else
2753 {
2754 aarch64_emit_move (dst_lo, src_lo);
2755 aarch64_emit_move (dst_hi, src_hi);
2756 }
2757 }
2758
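/* Return true if a 128-bit move from SRC to DST needs to be split into
two 64-bit moves; a move between two FP registers can instead be
handled by a single instruction. */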
2759 bool
2760 aarch64_split_128bit_move_p (rtx dst, rtx src)
2761 {
2762 return (! REG_P (src)
2763 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2764 }
2765
2766 /* Split a complex SIMD combine. */
2767
2768 void
2769 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2770 {
2771 machine_mode src_mode = GET_MODE (src1);
2772 machine_mode dst_mode = GET_MODE (dst);
2773
2774 gcc_assert (VECTOR_MODE_P (dst_mode));
2775 gcc_assert (register_operand (dst, dst_mode)
2776 && register_operand (src1, src_mode)
2777 && register_operand (src2, src_mode));
2778
2779 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2780 return;
2781 }
2782
2783 /* Split a complex SIMD move. */
2784
2785 void
2786 aarch64_split_simd_move (rtx dst, rtx src)
2787 {
2788 machine_mode src_mode = GET_MODE (src);
2789 machine_mode dst_mode = GET_MODE (dst);
2790
2791 gcc_assert (VECTOR_MODE_P (dst_mode));
2792
2793 if (REG_P (dst) && REG_P (src))
2794 {
2795 gcc_assert (VECTOR_MODE_P (src_mode));
2796 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2797 }
2798 }
2799
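/* Return true if the constant X, of mode XMODE, is equal to the
constant Y, of mode YMODE, zero-extended to XMODE. */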
2800 bool
2801 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2802 machine_mode ymode, rtx y)
2803 {
2804 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2805 gcc_assert (r != NULL);
2806 return rtx_equal_p (x, r);
2807 }
2808
2809 /* Return TARGET if it is nonnull and a register of mode MODE.
2810 Otherwise, return a fresh register of mode MODE if we can,
2811 or TARGET reinterpreted as MODE if we can't. */
2812
2813 static rtx
2814 aarch64_target_reg (rtx target, machine_mode mode)
2815 {
2816 if (target && REG_P (target) && GET_MODE (target) == mode)
2817 return target;
2818 if (!can_create_pseudo_p ())
2819 {
2820 gcc_assert (target);
2821 return gen_lowpart (mode, target);
2822 }
2823 return gen_reg_rtx (mode);
2824 }
2825
2826 /* Return a register that contains the constant in BUILDER, given that
2827 the constant is a legitimate move operand. Use TARGET as the register
2828 if it is nonnull and convenient. */
2829
2830 static rtx
2831 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2832 {
2833 rtx src = builder.build ();
2834 target = aarch64_target_reg (target, GET_MODE (src));
2835 emit_insn (gen_rtx_SET (target, src));
2836 return target;
2837 }
2838
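/* Return a register of mode MODE that holds VALUE: if possible, put
VALUE in a (possibly fresh) pseudo register; otherwise move it into
X, which must be nonnull. */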
2839 static rtx
2840 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2841 {
2842 if (can_create_pseudo_p ())
2843 return force_reg (mode, value);
2844 else
2845 {
2846 gcc_assert (x);
2847 aarch64_emit_move (x, value);
2848 return x;
2849 }
2850 }
2851
2852 /* Return true if predicate value X is a constant in which every element
2853 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2854 value, i.e. as a predicate in which all bits are significant. */
2855
2856 static bool
2857 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2858 {
2859 if (GET_CODE (x) != CONST_VECTOR)
2860 return false;
2861
2862 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2863 GET_MODE_NUNITS (GET_MODE (x)));
2864 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2865 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2866 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2867
2868 unsigned int nelts = const_vector_encoded_nelts (x);
2869 for (unsigned int i = 0; i < nelts; ++i)
2870 {
2871 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2872 if (!CONST_INT_P (elt))
2873 return false;
2874
2875 builder.quick_push (elt);
2876 for (unsigned int j = 1; j < factor; ++j)
2877 builder.quick_push (const0_rtx);
2878 }
2879 builder.finalize ();
2880 return true;
2881 }
2882
2883 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2884 widest predicate element size it can have (that is, the largest size
2885 for which each element would still be 0 or 1). */
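/* For example, a VNx16BI constant whose only set bits are at indices
0, 4, 8, ... can be interpreted as a predicate for 4-byte elements,
so the function below returns 4 for it. */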
2886
2887 unsigned int
2888 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2889 {
2890 /* Start with the most optimistic assumption: that we only need
2891 one bit per pattern. This is what we will use if only the first
2892 bit in each pattern is ever set. */
2893 unsigned int mask = GET_MODE_SIZE (DImode);
2894 mask |= builder.npatterns ();
2895
2896 /* Look for set bits. */
2897 unsigned int nelts = builder.encoded_nelts ();
2898 for (unsigned int i = 1; i < nelts; ++i)
2899 if (INTVAL (builder.elt (i)) != 0)
2900 {
2901 if (i & 1)
2902 return 1;
2903 mask |= i;
2904 }
2905 return mask & -mask;
2906 }
2907
2908 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
2909 return that predicate mode, otherwise return opt_machine_mode (). */
2910
2911 opt_machine_mode
2912 aarch64_ptrue_all_mode (rtx x)
2913 {
2914 gcc_assert (GET_MODE (x) == VNx16BImode);
2915 if (GET_CODE (x) != CONST_VECTOR
2916 || !CONST_VECTOR_DUPLICATE_P (x)
2917 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
2918 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
2919 return opt_machine_mode ();
2920
2921 unsigned int nelts = const_vector_encoded_nelts (x);
2922 for (unsigned int i = 1; i < nelts; ++i)
2923 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
2924 return opt_machine_mode ();
2925
2926 return aarch64_sve_pred_mode (nelts);
2927 }
2928
2929 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2930 that the constant would have with predicate element size ELT_SIZE
2931 (ignoring the upper bits in each element) and return:
2932
2933 * -1 if all bits are set
2934 * N if the predicate has N leading set bits followed by all clear bits
2935 * 0 if the predicate does not have any of these forms. */
2936
2937 int
2938 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2939 unsigned int elt_size)
2940 {
2941 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2942 followed by set bits. */
2943 if (builder.nelts_per_pattern () == 3)
2944 return 0;
2945
2946 /* Skip over leading set bits. */
2947 unsigned int nelts = builder.encoded_nelts ();
2948 unsigned int i = 0;
2949 for (; i < nelts; i += elt_size)
2950 if (INTVAL (builder.elt (i)) == 0)
2951 break;
2952 unsigned int vl = i / elt_size;
2953
2954 /* Check for the all-true case. */
2955 if (i == nelts)
2956 return -1;
2957
2958 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2959 repeating pattern of set bits followed by clear bits. */
2960 if (builder.nelts_per_pattern () != 2)
2961 return 0;
2962
2963 /* We have a "foreground" value and a duplicated "background" value.
2964 If the background might repeat and the last set bit belongs to it,
2965 we might have set bits followed by clear bits followed by set bits. */
2966 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2967 return 0;
2968
2969 /* Make sure that the rest are all clear. */
2970 for (; i < nelts; i += elt_size)
2971 if (INTVAL (builder.elt (i)) != 0)
2972 return 0;
2973
2974 return vl;
2975 }
2976
2977 /* See if there is an svpattern that encodes an SVE predicate of mode
2978 PRED_MODE in which the first VL bits are set and the rest are clear.
2979 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2980 A VL of -1 indicates an all-true vector. */
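/* For example, VL == 3 maps to AARCH64_SV_VL3, VL == 64 maps to
AARCH64_SV_VL64 and VL == -1 maps to AARCH64_SV_ALL. */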
2981
2982 aarch64_svpattern
2983 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2984 {
2985 if (vl < 0)
2986 return AARCH64_SV_ALL;
2987
2988 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2989 return AARCH64_NUM_SVPATTERNS;
2990
2991 if (vl >= 1 && vl <= 8)
2992 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2993
2994 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2995 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2996
2997 int max_vl;
2998 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2999 {
3000 if (vl == (max_vl / 3) * 3)
3001 return AARCH64_SV_MUL3;
3002 /* These would only trigger for non-power-of-2 lengths. */
3003 if (vl == (max_vl & -4))
3004 return AARCH64_SV_MUL4;
3005 if (vl == (1 << floor_log2 (max_vl)))
3006 return AARCH64_SV_POW2;
3007 if (vl == max_vl)
3008 return AARCH64_SV_ALL;
3009 }
3010 return AARCH64_NUM_SVPATTERNS;
3011 }
3012
3013 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3014 bits has the lowest bit set and the upper bits clear. This is the
3015 VNx16BImode equivalent of a PTRUE for controlling elements of
3016 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3017 all bits are significant, even the upper zeros. */
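/* For example, aarch64_ptrue_all (4) builds the VNx16BImode constant
{ 1, 0, 0, 0, 1, 0, 0, 0, ... }, which acts as a PTRUE for .S
(4-byte) elements. */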
3018
3019 rtx
3020 aarch64_ptrue_all (unsigned int elt_size)
3021 {
3022 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3023 builder.quick_push (const1_rtx);
3024 for (unsigned int i = 1; i < elt_size; ++i)
3025 builder.quick_push (const0_rtx);
3026 return builder.build ();
3027 }
3028
3029 /* Return an all-true predicate register of mode MODE. */
3030
3031 rtx
3032 aarch64_ptrue_reg (machine_mode mode)
3033 {
3034 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3035 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3036 return gen_lowpart (mode, reg);
3037 }
3038
3039 /* Return an all-false predicate register of mode MODE. */
3040
3041 rtx
3042 aarch64_pfalse_reg (machine_mode mode)
3043 {
3044 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3045 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3046 return gen_lowpart (mode, reg);
3047 }
3048
3049 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3050 true, or alternatively if we know that the operation predicated by
3051 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3052 aarch64_sve_gp_strictness operand that describes the operation
3053 predicated by PRED1[0]. */
3054
3055 bool
3056 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3057 {
3058 machine_mode mode = GET_MODE (pred2);
3059 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3060 && mode == GET_MODE (pred1[0])
3061 && aarch64_sve_gp_strictness (pred1[1], SImode));
3062 return (pred1[0] == CONSTM1_RTX (mode)
3063 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3064 || rtx_equal_p (pred1[0], pred2));
3065 }
3066
3067 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3068 for it. PRED2[0] is the predicate for the instruction whose result
3069 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3070 for it. Return true if we can prove that the two predicates are
3071 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3072 with PRED1[0] without changing behavior. */
3073
3074 bool
3075 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3076 {
3077 machine_mode mode = GET_MODE (pred1[0]);
3078 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3079 && mode == GET_MODE (pred2[0])
3080 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3081 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3082
3083 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3084 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3085 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3086 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3087 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3088 }
3089
3090 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3091 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3092 Use TARGET as the target register if nonnull and convenient. */
3093
3094 static rtx
3095 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3096 machine_mode data_mode, rtx op1, rtx op2)
3097 {
3098 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3099 expand_operand ops[5];
3100 create_output_operand (&ops[0], target, pred_mode);
3101 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3102 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3103 create_input_operand (&ops[3], op1, data_mode);
3104 create_input_operand (&ops[4], op2, data_mode);
3105 expand_insn (icode, 5, ops);
3106 return ops[0].value;
3107 }
3108
3109 /* Use a comparison to convert integer vector SRC into MODE, which is
3110 the corresponding SVE predicate mode. Use TARGET for the result
3111 if it's nonnull and convenient. */
3112
3113 rtx
3114 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3115 {
3116 machine_mode src_mode = GET_MODE (src);
3117 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3118 src, CONST0_RTX (src_mode));
3119 }
3120
3121 /* Return the assembly token for svprfop value PRFOP. */
3122
3123 static const char *
3124 svprfop_token (enum aarch64_svprfop prfop)
3125 {
3126 switch (prfop)
3127 {
3128 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3129 AARCH64_FOR_SVPRFOP (CASE)
3130 #undef CASE
3131 case AARCH64_NUM_SVPRFOPS:
3132 break;
3133 }
3134 gcc_unreachable ();
3135 }
3136
3137 /* Return the assembly string for an SVE prefetch operation with
3138 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3139 and that SUFFIX is the format for the remaining operands. */
3140
3141 char *
3142 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3143 const char *suffix)
3144 {
3145 static char buffer[128];
3146 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3147 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3148 mnemonic, svprfop_token (prfop), suffix);
3149 gcc_assert (written < sizeof (buffer));
3150 return buffer;
3151 }
3152
3153 /* Check whether we can calculate the number of elements in PATTERN
3154 at compile time, given that there are NELTS_PER_VQ elements per
3155 128-bit block. Return the value if so, otherwise return -1. */
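/* For example, assuming a fixed 256-bit vector length (two 128-bit
blocks) and NELTS_PER_VQ == 4, there are 8 elements in total:
AARCH64_SV_MUL3 then folds to 6, AARCH64_SV_ALL folds to 8, and
AARCH64_SV_VL16 folds to 0 because more elements are requested than
are available. */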
3156
3157 HOST_WIDE_INT
3158 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3159 {
3160 unsigned int vl, const_vg;
3161 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3162 vl = 1 + (pattern - AARCH64_SV_VL1);
3163 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3164 vl = 16 << (pattern - AARCH64_SV_VL16);
3165 else if (aarch64_sve_vg.is_constant (&const_vg))
3166 {
3167 /* There are two vector granules per quadword. */
3168 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3169 switch (pattern)
3170 {
3171 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3172 case AARCH64_SV_MUL4: return nelts & -4;
3173 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3174 case AARCH64_SV_ALL: return nelts;
3175 default: gcc_unreachable ();
3176 }
3177 }
3178 else
3179 return -1;
3180
3181 /* There are two vector granules per quadword. */
3182 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3183 if (known_le (vl, nelts_all))
3184 return vl;
3185
3186 /* Requesting more elements than are available results in a PFALSE. */
3187 if (known_gt (vl, nelts_all))
3188 return 0;
3189
3190 return -1;
3191 }
3192
3193 /* Return true if we can move VALUE into a register using a single
3194 CNT[BHWD] instruction. */
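/* For example, the value 12 * VQ (where VQ is the number of 128-bit
blocks in a vector) is represented as the poly_int64 (12, 12) and can
be loaded with CNTW MUL #3: the factor 12 is even, lies in [2, 256]
and is at most 16 times its lowest set bit (4). */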
3195
3196 static bool
3197 aarch64_sve_cnt_immediate_p (poly_int64 value)
3198 {
3199 HOST_WIDE_INT factor = value.coeffs[0];
3200 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3201 return (value.coeffs[1] == factor
3202 && IN_RANGE (factor, 2, 16 * 16)
3203 && (factor & 1) == 0
3204 && factor <= 16 * (factor & -factor));
3205 }
3206
3207 /* Likewise for rtx X. */
3208
3209 bool
3210 aarch64_sve_cnt_immediate_p (rtx x)
3211 {
3212 poly_int64 value;
3213 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3214 }
3215
3216 /* Return the asm string for an instruction with a CNT-like vector size
3217 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3218 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3219 first part of the operands template (the part that comes before the
3220 vector size itself). PATTERN is the pattern to use. FACTOR is the
3221 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3222 in each quadword. If it is zero, we can use any element size. */
3223
3224 static char *
3225 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3226 aarch64_svpattern pattern,
3227 unsigned int factor,
3228 unsigned int nelts_per_vq)
3229 {
3230 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3231
3232 if (nelts_per_vq == 0)
3233 /* There is some overlap in the ranges of the four CNT instructions.
3234 Here we always use the smallest possible element size, so that the
3235 multiplier is 1 wherever possible. */
3236 nelts_per_vq = factor & -factor;
3237 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3238 gcc_assert (IN_RANGE (shift, 1, 4));
3239 char suffix = "dwhb"[shift - 1];
3240
3241 factor >>= shift;
3242 unsigned int written;
3243 if (pattern == AARCH64_SV_ALL && factor == 1)
3244 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3245 prefix, suffix, operands);
3246 else if (factor == 1)
3247 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3248 prefix, suffix, operands, svpattern_token (pattern));
3249 else
3250 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3251 prefix, suffix, operands, svpattern_token (pattern),
3252 factor);
3253 gcc_assert (written < sizeof (buffer));
3254 return buffer;
3255 }
3256
3257 /* Return the asm string for an instruction with a CNT-like vector size
3258 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3259 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3260 first part of the operands template (the part that comes before the
3261 vector size itself). X is the value of the vector size operand,
3262 as a polynomial integer rtx; we need to convert this into an "all"
3263 pattern with a multiplier. */
3264
3265 char *
3266 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3267 rtx x)
3268 {
3269 poly_int64 value = rtx_to_poly_int64 (x);
3270 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3271 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3272 value.coeffs[1], 0);
3273 }
3274
3275 /* Return the asm string for an instruction with a CNT-like vector size
3276 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3277 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3278 first part of the operands template (the part that comes before the
3279 vector size itself). CNT_PAT[0..2] are the operands of the
3280 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3281
3282 char *
3283 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3284 const char *operands, rtx *cnt_pat)
3285 {
3286 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3287 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3288 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3289 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3290 factor, nelts_per_vq);
3291 }
3292
3293 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3294
3295 bool
3296 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3297 {
3298 poly_int64 value;
3299 return (poly_int_rtx_p (x, &value)
3300 && (aarch64_sve_cnt_immediate_p (value)
3301 || aarch64_sve_cnt_immediate_p (-value)));
3302 }
3303
3304 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3305 operand 0. */
3306
3307 char *
3308 aarch64_output_sve_scalar_inc_dec (rtx offset)
3309 {
3310 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3311 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3312 if (offset_value.coeffs[1] > 0)
3313 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3314 offset_value.coeffs[1], 0);
3315 else
3316 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3317 -offset_value.coeffs[1], 0);
3318 }
3319
3320 /* Return true if we can add VALUE to a register using a single ADDVL
3321 or ADDPL instruction. */
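/* For example, an offset of five SVE vectors corresponds to a factor
of 80 and is handled by ADDVL #5, while an offset of three SVE
predicates corresponds to a factor of 6 and is handled by ADDPL #3. */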
3322
3323 static bool
3324 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3325 {
3326 HOST_WIDE_INT factor = value.coeffs[0];
3327 if (factor == 0 || value.coeffs[1] != factor)
3328 return false;
3329 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3330 and a value of 16 is one vector width. */
3331 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3332 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3333 }
3334
3335 /* Likewise for rtx X. */
3336
3337 bool
3338 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3339 {
3340 poly_int64 value;
3341 return (poly_int_rtx_p (x, &value)
3342 && aarch64_sve_addvl_addpl_immediate_p (value));
3343 }
3344
3345 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3346 to operand 1 and storing the result in operand 0. */
3347
3348 char *
3349 aarch64_output_sve_addvl_addpl (rtx offset)
3350 {
3351 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3352 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3353 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3354
3355 int factor = offset_value.coeffs[1];
3356 if ((factor & 15) == 0)
3357 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3358 else
3359 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3360 return buffer;
3361 }
3362
3363 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3364 instruction. If it is, store the number of elements in each vector
3365 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3366 factor in *FACTOR_OUT (if nonnull). */
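/* For example, a VNx8HImode vector in which every element is the
poly_int64 (24, 24) has NELTS_PER_VQ == 8 and FACTOR == 24, which is
a multiple of 8 within [8, 128], so it can be handled by INCH with
MUL #3. */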
3367
3368 bool
3369 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3370 unsigned int *nelts_per_vq_out)
3371 {
3372 rtx elt;
3373 poly_int64 value;
3374
3375 if (!const_vec_duplicate_p (x, &elt)
3376 || !poly_int_rtx_p (elt, &value))
3377 return false;
3378
3379 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3380 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3381 /* There's no vector INCB. */
3382 return false;
3383
3384 HOST_WIDE_INT factor = value.coeffs[0];
3385 if (value.coeffs[1] != factor)
3386 return false;
3387
3388 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3389 if ((factor % nelts_per_vq) != 0
3390 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3391 return false;
3392
3393 if (factor_out)
3394 *factor_out = factor;
3395 if (nelts_per_vq_out)
3396 *nelts_per_vq_out = nelts_per_vq;
3397 return true;
3398 }
3399
3400 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3401 instruction. */
3402
3403 bool
3404 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3405 {
3406 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3407 }
3408
3409 /* Return the asm template for an SVE vector INC or DEC instruction.
3410 OPERANDS gives the operands before the vector count and X is the
3411 value of the vector count operand itself. */
3412
3413 char *
3414 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3415 {
3416 int factor;
3417 unsigned int nelts_per_vq;
3418 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3419 gcc_unreachable ();
3420 if (factor < 0)
3421 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3422 -factor, nelts_per_vq);
3423 else
3424 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3425 factor, nelts_per_vq);
3426 }
3427
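/* Return the number of instructions needed to move the CONST_INT IMM
of mode MODE into DEST, emitting them if GENERATE is true. For
example, 0xffffffffffff1234 is a single MOVN, while 0x12345678 needs
a MOVZ followed by a MOVK. */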
3428 static int
3429 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3430 scalar_int_mode mode)
3431 {
3432 int i;
3433 unsigned HOST_WIDE_INT val, val2, mask;
3434 int one_match, zero_match;
3435 int num_insns;
3436
3437 val = INTVAL (imm);
3438
3439 if (aarch64_move_imm (val, mode))
3440 {
3441 if (generate)
3442 emit_insn (gen_rtx_SET (dest, imm));
3443 return 1;
3444 }
3445
3446 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3447 (with XXXX non-zero). In that case check to see if the move can be done in
3448 a smaller mode. */
3449 val2 = val & 0xffffffff;
3450 if (mode == DImode
3451 && aarch64_move_imm (val2, SImode)
3452 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3453 {
3454 if (generate)
3455 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3456
3457 /* Check whether we have to emit a second instruction by seeing
3458 whether any of the upper 32 bits of the original DImode value are set. */
3459 if (val == val2)
3460 return 1;
3461
3462 i = (val >> 48) ? 48 : 32;
3463
3464 if (generate)
3465 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3466 GEN_INT ((val >> i) & 0xffff)));
3467
3468 return 2;
3469 }
3470
3471 if ((val >> 32) == 0 || mode == SImode)
3472 {
3473 if (generate)
3474 {
3475 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3476 if (mode == SImode)
3477 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3478 GEN_INT ((val >> 16) & 0xffff)));
3479 else
3480 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3481 GEN_INT ((val >> 16) & 0xffff)));
3482 }
3483 return 2;
3484 }
3485
3486 /* Remaining cases are all for DImode. */
3487
3488 mask = 0xffff;
3489 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3490 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3491 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3492 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3493
3494 if (zero_match != 2 && one_match != 2)
3495 {
3496 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3497 For a 64-bit bitmask try whether changing 16 bits to all ones or
3498 zeroes creates a valid bitmask. To check any repeated bitmask,
3499 try using 16 bits from the other 32-bit half of val. */
3500
3501 for (i = 0; i < 64; i += 16, mask <<= 16)
3502 {
3503 val2 = val & ~mask;
3504 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3505 break;
3506 val2 = val | mask;
3507 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3508 break;
3509 val2 = val2 & ~mask;
3510 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3511 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3512 break;
3513 }
3514 if (i != 64)
3515 {
3516 if (generate)
3517 {
3518 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3519 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3520 GEN_INT ((val >> i) & 0xffff)));
3521 }
3522 return 2;
3523 }
3524 }
3525
3526 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3527 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3528 otherwise skip zero bits. */
3529
3530 num_insns = 1;
3531 mask = 0xffff;
3532 val2 = one_match > zero_match ? ~val : val;
3533 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3534
3535 if (generate)
3536 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3537 ? (val | ~(mask << i))
3538 : (val & (mask << i)))));
3539 for (i += 16; i < 64; i += 16)
3540 {
3541 if ((val2 & (mask << i)) == 0)
3542 continue;
3543 if (generate)
3544 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3545 GEN_INT ((val >> i) & 0xffff)));
3546 num_insns ++;
3547 }
3548
3549 return num_insns;
3550 }
3551
3552 /* Return whether imm is a 128-bit immediate which is simple enough to
3553 expand inline. */
3554 bool
3555 aarch64_mov128_immediate (rtx imm)
3556 {
3557 if (GET_CODE (imm) == CONST_INT)
3558 return true;
3559
3560 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3561
3562 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3563 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3564
3565 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3566 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3567 }
3568
3569
3570 /* Return the number of temporary registers that aarch64_add_offset_1
3571 would need to add OFFSET to a register. */
3572
3573 static unsigned int
3574 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3575 {
3576 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3577 }
3578
3579 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3580 a non-polynomial OFFSET. MODE is the mode of the addition.
3581 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3582 be set and CFA adjustments added to the generated instructions.
3583
3584 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3585 temporary if register allocation is already complete. This temporary
3586 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3587 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3588 the immediate again.
3589
3590 Since this function may be used to adjust the stack pointer, we must
3591 ensure that it cannot cause transient stack deallocation (for example
3592 by first incrementing SP and then decrementing when adjusting by a
3593 large immediate). */
3594
3595 static void
3596 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3597 rtx src, HOST_WIDE_INT offset, rtx temp1,
3598 bool frame_related_p, bool emit_move_imm)
3599 {
3600 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3601 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3602
3603 HOST_WIDE_INT moffset = abs_hwi (offset);
3604 rtx_insn *insn;
3605
3606 if (!moffset)
3607 {
3608 if (!rtx_equal_p (dest, src))
3609 {
3610 insn = emit_insn (gen_rtx_SET (dest, src));
3611 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3612 }
3613 return;
3614 }
3615
3616 /* Single instruction adjustment. */
3617 if (aarch64_uimm12_shift (moffset))
3618 {
3619 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3620 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3621 return;
3622 }
3623
3624 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3625 and either:
3626
3627 a) the offset cannot be loaded by a 16-bit move or
3628 b) there is no spare register into which we can move it. */
3629 if (moffset < 0x1000000
3630 && ((!temp1 && !can_create_pseudo_p ())
3631 || !aarch64_move_imm (moffset, mode)))
3632 {
3633 HOST_WIDE_INT low_off = moffset & 0xfff;
3634
3635 low_off = offset < 0 ? -low_off : low_off;
3636 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3637 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3638 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3639 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3640 return;
3641 }
3642
3643 /* Emit a move immediate if required and an addition/subtraction. */
3644 if (emit_move_imm)
3645 {
3646 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3647 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3648 }
3649 insn = emit_insn (offset < 0
3650 ? gen_sub3_insn (dest, src, temp1)
3651 : gen_add3_insn (dest, src, temp1));
3652 if (frame_related_p)
3653 {
3654 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3655 rtx adj = plus_constant (mode, src, offset);
3656 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3657 }
3658 }
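
/* Editorial note: an illustrative sketch of the paths above (offsets are
   assumptions, not from this file).  OFFSET = 0x123456 is below 0x1000000
   and not a valid move immediate, so it is split into two additions:

       add  dest, src, #0x456
       add  dest, dest, #0x123000

   A larger offset such as 0x12345678 instead takes the final path: the
   absolute value is moved into TEMP1 and a single ADD or SUB follows.  */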
3659
3660 /* Return the number of temporary registers that aarch64_add_offset
3661 would need to move OFFSET into a register or add OFFSET to a register;
3662 ADD_P is true if we want the latter rather than the former. */
3663
3664 static unsigned int
3665 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3666 {
3667 /* This follows the same structure as aarch64_add_offset. */
3668 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3669 return 0;
3670
3671 unsigned int count = 0;
3672 HOST_WIDE_INT factor = offset.coeffs[1];
3673 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3674 poly_int64 poly_offset (factor, factor);
3675 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3676 /* Need one register for the ADDVL/ADDPL result. */
3677 count += 1;
3678 else if (factor != 0)
3679 {
3680 factor = abs (factor);
3681 if (factor > 16 * (factor & -factor))
3682 /* Need one register for the CNT result and one for the multiplication
3683 factor. If necessary, the second temporary can be reused for the
3684 constant part of the offset. */
3685 return 2;
3686 /* Need one register for the CNT result (which might then
3687 be shifted). */
3688 count += 1;
3689 }
3690 return count + aarch64_add_offset_1_temporaries (constant);
3691 }
3692
3693 /* If X can be represented as a poly_int64, return the number
3694 of temporaries that are required to add it to a register.
3695 Return -1 otherwise. */
3696
3697 int
3698 aarch64_add_offset_temporaries (rtx x)
3699 {
3700 poly_int64 offset;
3701 if (!poly_int_rtx_p (x, &offset))
3702 return -1;
3703 return aarch64_offset_temporaries (true, offset);
3704 }
3705
3706 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3707 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3708 be set and CFA adjustments added to the generated instructions.
3709
3710 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3711 temporary if register allocation is already complete. This temporary
3712 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3713 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3714 false to avoid emitting the immediate again.
3715
3716 TEMP2, if nonnull, is a second temporary register that doesn't
3717 overlap either DEST or SRC.
3718
3719 Since this function may be used to adjust the stack pointer, we must
3720 ensure that it cannot cause transient stack deallocation (for example
3721 by first incrementing SP and then decrementing when adjusting by a
3722 large immediate). */
3723
3724 static void
3725 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3726 poly_int64 offset, rtx temp1, rtx temp2,
3727 bool frame_related_p, bool emit_move_imm = true)
3728 {
3729 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3730 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3731 gcc_assert (temp1 == NULL_RTX
3732 || !frame_related_p
3733 || !reg_overlap_mentioned_p (temp1, dest));
3734 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3735
3736 /* Try using ADDVL or ADDPL to add the whole value. */
3737 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3738 {
3739 rtx offset_rtx = gen_int_mode (offset, mode);
3740 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3741 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3742 return;
3743 }
3744
3745 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3746 SVE vector register, over and above the minimum size of 128 bits.
3747 This is equivalent to half the value returned by CNTD with a
3748 vector shape of ALL. */
3749 HOST_WIDE_INT factor = offset.coeffs[1];
3750 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3751
3752 /* Try using ADDVL or ADDPL to add the VG-based part. */
3753 poly_int64 poly_offset (factor, factor);
3754 if (src != const0_rtx
3755 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3756 {
3757 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3758 if (frame_related_p)
3759 {
3760 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3761 RTX_FRAME_RELATED_P (insn) = true;
3762 src = dest;
3763 }
3764 else
3765 {
3766 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3767 src = aarch64_force_temporary (mode, temp1, addr);
3768 temp1 = temp2;
3769 temp2 = NULL_RTX;
3770 }
3771 }
3772 /* Otherwise use a CNT-based sequence. */
3773 else if (factor != 0)
3774 {
3775 /* Use a subtraction if we have a negative factor. */
3776 rtx_code code = PLUS;
3777 if (factor < 0)
3778 {
3779 factor = -factor;
3780 code = MINUS;
3781 }
3782
3783 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3784 into the multiplication. */
3785 rtx val;
3786 int shift = 0;
3787 if (factor & 1)
3788 /* Use a right shift by 1. */
3789 shift = -1;
3790 else
3791 factor /= 2;
3792 HOST_WIDE_INT low_bit = factor & -factor;
3793 if (factor <= 16 * low_bit)
3794 {
3795 if (factor > 16 * 8)
3796 {
3797 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3798 the value with the minimum multiplier and shift it into
3799 position. */
3800 int extra_shift = exact_log2 (low_bit);
3801 shift += extra_shift;
3802 factor >>= extra_shift;
3803 }
3804 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3805 }
3806 else
3807 {
3808 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3809 directly, since that should increase the chances of being
3810 able to use a shift and add sequence. If LOW_BIT itself
3811 is out of range, just use CNTD. */
3812 if (low_bit <= 16 * 8)
3813 factor /= low_bit;
3814 else
3815 low_bit = 1;
3816
3817 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3818 val = aarch64_force_temporary (mode, temp1, val);
3819
3820 if (can_create_pseudo_p ())
3821 {
3822 rtx coeff1 = gen_int_mode (factor, mode);
3823 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3824 }
3825 else
3826 {
3827 /* Go back to using a negative multiplication factor if we have
3828 no register from which to subtract. */
3829 if (code == MINUS && src == const0_rtx)
3830 {
3831 factor = -factor;
3832 code = PLUS;
3833 }
3834 rtx coeff1 = gen_int_mode (factor, mode);
3835 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3836 val = gen_rtx_MULT (mode, val, coeff1);
3837 }
3838 }
3839
3840 if (shift > 0)
3841 {
3842 /* Multiply by 1 << SHIFT. */
3843 val = aarch64_force_temporary (mode, temp1, val);
3844 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3845 }
3846 else if (shift == -1)
3847 {
3848 /* Divide by 2. */
3849 val = aarch64_force_temporary (mode, temp1, val);
3850 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3851 }
3852
3853 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3854 if (src != const0_rtx)
3855 {
3856 val = aarch64_force_temporary (mode, temp1, val);
3857 val = gen_rtx_fmt_ee (code, mode, src, val);
3858 }
3859 else if (code == MINUS)
3860 {
3861 val = aarch64_force_temporary (mode, temp1, val);
3862 val = gen_rtx_NEG (mode, val);
3863 }
3864
3865 if (constant == 0 || frame_related_p)
3866 {
3867 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3868 if (frame_related_p)
3869 {
3870 RTX_FRAME_RELATED_P (insn) = true;
3871 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3872 gen_rtx_SET (dest, plus_constant (Pmode, src,
3873 poly_offset)));
3874 }
3875 src = dest;
3876 if (constant == 0)
3877 return;
3878 }
3879 else
3880 {
3881 src = aarch64_force_temporary (mode, temp1, val);
3882 temp1 = temp2;
3883 temp2 = NULL_RTX;
3884 }
3885
3886 emit_move_imm = true;
3887 }
3888
3889 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3890 frame_related_p, emit_move_imm);
3891 }
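
/* Editorial note: a hedged illustration of the poly_int handling above.
   An offset of one whole SVE vector, poly_int64 (16, 16) bytes, satisfies
   aarch64_sve_addvl_addpl_immediate_p and so becomes a single

       addvl sp, sp, #1

   when adjusting the stack pointer, whereas factors that cannot be
   expressed as ADDVL/ADDPL immediates fall through to the CNT-based
   multiply/shift sequence above.  */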
3892
3893 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3894 than a poly_int64. */
3895
3896 void
3897 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3898 rtx offset_rtx, rtx temp1, rtx temp2)
3899 {
3900 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3901 temp1, temp2, false);
3902 }
3903
3904 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3905 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3906 if TEMP1 already contains abs (DELTA). */
3907
3908 static inline void
3909 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3910 {
3911 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3912 temp1, temp2, true, emit_move_imm);
3913 }
3914
3915 /* Subtract DELTA from the stack pointer, marking the instructions
3916 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3917 if nonnull. */
3918
3919 static inline void
3920 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3921 bool emit_move_imm = true)
3922 {
3923 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3924 temp1, temp2, frame_related_p, emit_move_imm);
3925 }
3926
3927 /* Set DEST to (vec_series BASE STEP). */
3928
3929 static void
3930 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3931 {
3932 machine_mode mode = GET_MODE (dest);
3933 scalar_mode inner = GET_MODE_INNER (mode);
3934
3935 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3936 if (!aarch64_sve_index_immediate_p (base))
3937 base = force_reg (inner, base);
3938 if (!aarch64_sve_index_immediate_p (step))
3939 step = force_reg (inner, step);
3940
3941 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3942 }
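
/* Editorial note: a brief illustration (operands are assumptions).  With
   BASE = 0 and STEP = 1 in VNx4SImode the VEC_SERIES above maps onto a
   single "index z0.s, #0, #1"; operands outside the [-16, 15] range are
   first forced into scalar registers, giving the register forms of
   INDEX.  */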
3943
3944 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3945 register of mode MODE. Use TARGET for the result if it's nonnull
3946 and convenient.
3947
3948 The two vector modes must have the same element mode. The behavior
3949 is to duplicate architectural lane N of SRC into architectural lanes
3950 N + I * STEP of the result. On big-endian targets, architectural
3951 lane 0 of an Advanced SIMD vector is the last element of the vector
3952 in memory layout, so for big-endian targets this operation has the
3953 effect of reversing SRC before duplicating it. Callers need to
3954 account for this. */
3955
3956 rtx
3957 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3958 {
3959 machine_mode src_mode = GET_MODE (src);
3960 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3961 insn_code icode = (BYTES_BIG_ENDIAN
3962 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3963 : code_for_aarch64_vec_duplicate_vq_le (mode));
3964
3965 unsigned int i = 0;
3966 expand_operand ops[3];
3967 create_output_operand (&ops[i++], target, mode);
3968 create_output_operand (&ops[i++], src, src_mode);
3969 if (BYTES_BIG_ENDIAN)
3970 {
3971 /* Create a PARALLEL describing the reversal of SRC. */
3972 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3973 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3974 nelts_per_vq - 1, -1);
3975 create_fixed_operand (&ops[i++], sel);
3976 }
3977 expand_insn (icode, i, ops);
3978 return ops[0].value;
3979 }
3980
3981 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3982 the memory image into DEST. Return true on success. */
3983
3984 static bool
3985 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3986 {
3987 src = force_const_mem (GET_MODE (src), src);
3988 if (!src)
3989 return false;
3990
3991 /* Make sure that the address is legitimate. */
3992 if (!aarch64_sve_ld1rq_operand_p (src))
3993 {
3994 rtx addr = force_reg (Pmode, XEXP (src, 0));
3995 src = replace_equiv_address (src, addr);
3996 }
3997
3998 machine_mode mode = GET_MODE (dest);
3999 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
4000 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
4001 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4002 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4003 return true;
4004 }
4005
4006 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4007 SVE data mode and isn't a legitimate constant. Use TARGET for the
4008 result if convenient.
4009
4010 The returned register can have whatever mode seems most natural
4011 given the contents of SRC. */
4012
4013 static rtx
4014 aarch64_expand_sve_const_vector (rtx target, rtx src)
4015 {
4016 machine_mode mode = GET_MODE (src);
4017 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4018 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4019 scalar_mode elt_mode = GET_MODE_INNER (mode);
4020 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4021 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
4022
4023 if (nelts_per_pattern == 1 && encoded_bits == 128)
4024 {
4025 /* The constant is a duplicated quadword but can't be narrowed
4026 beyond a quadword. Get the memory image of the first quadword
4027 as a 128-bit vector and try using LD1RQ to load it from memory.
4028
4029 The effect for both endiannesses is to load memory lane N into
4030 architectural lanes N + I * STEP of the result. On big-endian
4031 targets, the layout of the 128-bit vector in an Advanced SIMD
4032 register would be different from its layout in an SVE register,
4033 but this 128-bit vector is a memory value only. */
4034 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4035 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4036 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4037 return target;
4038 }
4039
4040 if (nelts_per_pattern == 1 && encoded_bits < 128)
4041 {
4042 /* The vector is a repeating sequence of 64 bits or fewer.
4043 See if we can load them using an Advanced SIMD move and then
4044 duplicate it to fill a vector. This is better than using a GPR
4045 move because it keeps everything in the same register file. */
4046 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4047 rtx_vector_builder builder (vq_mode, npatterns, 1);
4048 for (unsigned int i = 0; i < npatterns; ++i)
4049 {
4050 /* We want memory lane N to go into architectural lane N,
4051 so reverse for big-endian targets. The DUP .Q pattern
4052 has a compensating reverse built-in. */
4053 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4054 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4055 }
4056 rtx vq_src = builder.build ();
4057 if (aarch64_simd_valid_immediate (vq_src, NULL))
4058 {
4059 vq_src = force_reg (vq_mode, vq_src);
4060 return aarch64_expand_sve_dupq (target, mode, vq_src);
4061 }
4062
4063 /* Get an integer representation of the repeating part of Advanced
4064 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4065 which for big-endian targets is lane-swapped wrt a normal
4066 Advanced SIMD vector. This means that for both endiannesses,
4067 memory lane N of SVE vector SRC corresponds to architectural
4068 lane N of a register holding VQ_SRC. This in turn means that
4069 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4070 as a single 128-bit value) and thus that memory lane 0 of SRC is
4071 in the lsb of the integer. Duplicating the integer therefore
4072 ensures that memory lane N of SRC goes into architectural lane
4073 N + I * STEP of the SVE register. */
4074 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4075 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4076 if (elt_value)
4077 {
4078 /* Pretend that we had a vector of INT_MODE to start with. */
4079 elt_mode = int_mode;
4080 mode = aarch64_full_sve_mode (int_mode).require ();
4081
4082 /* If the integer can be moved into a general register by a
4083 single instruction, do that and duplicate the result. */
4084 if (CONST_INT_P (elt_value)
4085 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4086 {
4087 elt_value = force_reg (elt_mode, elt_value);
4088 return expand_vector_broadcast (mode, elt_value);
4089 }
4090 }
4091 else if (npatterns == 1)
4092 /* We're duplicating a single value, but can't do better than
4093 force it to memory and load from there. This handles things
4094 like symbolic constants. */
4095 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4096
4097 if (elt_value)
4098 {
4099 /* Load the element from memory if we can, otherwise move it into
4100 a register and use a DUP. */
4101 rtx op = force_const_mem (elt_mode, elt_value);
4102 if (!op)
4103 op = force_reg (elt_mode, elt_value);
4104 return expand_vector_broadcast (mode, op);
4105 }
4106 }
4107
4108 /* Try using INDEX. */
4109 rtx base, step;
4110 if (const_vec_series_p (src, &base, &step))
4111 {
4112 aarch64_expand_vec_series (target, base, step);
4113 return target;
4114 }
4115
4116 /* From here on, it's better to force the whole constant to memory
4117 if we can. */
4118 if (GET_MODE_NUNITS (mode).is_constant ())
4119 return NULL_RTX;
4120
4121 /* Expand each pattern individually. */
4122 gcc_assert (npatterns > 1);
4123 rtx_vector_builder builder;
4124 auto_vec<rtx, 16> vectors (npatterns);
4125 for (unsigned int i = 0; i < npatterns; ++i)
4126 {
4127 builder.new_vector (mode, 1, nelts_per_pattern);
4128 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4129 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4130 vectors.quick_push (force_reg (mode, builder.build ()));
4131 }
4132
4133 /* Use permutes to interleave the separate vectors. */
4134 while (npatterns > 1)
4135 {
4136 npatterns /= 2;
4137 for (unsigned int i = 0; i < npatterns; ++i)
4138 {
4139 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4140 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4141 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4142 vectors[i] = tmp;
4143 }
4144 }
4145 gcc_assert (vectors[0] == target);
4146 return target;
4147 }
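
/* Editorial note: a hedged sketch of the final permute step above, for
   the case where none of the cheaper strategies apply.  A constant with
   four patterns such as { 1, 2, 3, 4, 1, 2, 3, 4, ... } is first built
   as four duplicated vectors and then interleaved pairwise:

       zip1  t0, v1, v3      // { 1, 3, 1, 3, ... }
       zip1  t1, v2, v4      // { 2, 4, 2, 4, ... }
       zip1  dest, t0, t1    // { 1, 2, 3, 4, ... }

   where vN holds the duplicated value N.  */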
4148
4149 /* Use WHILE to set a predicate register of mode MODE in which the first
4150 VL bits are set and the rest are clear. Use TARGET for the register
4151 if it's nonnull and convenient. */
4152
4153 static rtx
4154 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4155 unsigned int vl)
4156 {
4157 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4158 target = aarch64_target_reg (target, mode);
4159 emit_insn (gen_while (UNSPEC_WHILE_LO, DImode, mode,
4160 target, const0_rtx, limit));
4161 return target;
4162 }
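
/* Editorial note: an illustrative sketch of the helper above.  For
   MODE = VNx4BImode and VL = 3 it emits roughly

       mov      x0, 3
       whilelo  p0.s, xzr, x0

   which sets the first three .S predicate elements and clears the rest
   (register numbers are illustrative only).  */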
4163
4164 static rtx
4165 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4166
4167 /* BUILDER is a constant predicate in which the index of every set bit
4168 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4169 by inverting every element at a multiple of ELT_SIZE and EORing the
4170 result with an ELT_SIZE PTRUE.
4171
4172 Return a register that contains the constant on success, otherwise
4173 return null. Use TARGET as the register if it is nonnull and
4174 convenient. */
4175
4176 static rtx
4177 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4178 unsigned int elt_size)
4179 {
4180 /* Invert every element at a multiple of ELT_SIZE, keeping the
4181 other bits zero. */
4182 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4183 builder.nelts_per_pattern ());
4184 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4185 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4186 inv_builder.quick_push (const1_rtx);
4187 else
4188 inv_builder.quick_push (const0_rtx);
4189 inv_builder.finalize ();
4190
4191 /* See if we can load the constant cheaply. */
4192 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4193 if (!inv)
4194 return NULL_RTX;
4195
4196 /* EOR the result with an ELT_SIZE PTRUE. */
4197 rtx mask = aarch64_ptrue_all (elt_size);
4198 mask = force_reg (VNx16BImode, mask);
4199 target = aarch64_target_reg (target, VNx16BImode);
4200 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4201 return target;
4202 }
4203
4204 /* BUILDER is a constant predicate in which the index of every set bit
4205 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4206 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4207 register on success, otherwise return null. Use TARGET as the register
4208 if nonnull and convenient. */
4209
4210 static rtx
4211 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4212 unsigned int elt_size,
4213 unsigned int permute_size)
4214 {
4215 /* We're going to split the constant into two new constants A and B,
4216 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4217 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4218
4219 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4220 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4221
4222 where _ indicates elements that will be discarded by the permute.
4223
4224 First calculate the ELT_SIZEs for A and B. */
4225 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4226 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4227 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4228 if (INTVAL (builder.elt (i)) != 0)
4229 {
4230 if (i & permute_size)
4231 b_elt_size |= i - permute_size;
4232 else
4233 a_elt_size |= i;
4234 }
4235 a_elt_size &= -a_elt_size;
4236 b_elt_size &= -b_elt_size;
4237
4238 /* Now construct the vectors themselves. */
4239 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4240 builder.nelts_per_pattern ());
4241 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4242 builder.nelts_per_pattern ());
4243 unsigned int nelts = builder.encoded_nelts ();
4244 for (unsigned int i = 0; i < nelts; ++i)
4245 if (i & (elt_size - 1))
4246 {
4247 a_builder.quick_push (const0_rtx);
4248 b_builder.quick_push (const0_rtx);
4249 }
4250 else if ((i & permute_size) == 0)
4251 {
4252 /* The A and B elements are significant. */
4253 a_builder.quick_push (builder.elt (i));
4254 b_builder.quick_push (builder.elt (i + permute_size));
4255 }
4256 else
4257 {
4258 /* The A and B elements are going to be discarded, so pick whatever
4259 is likely to give a nice constant. We are targeting element
4260 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4261 with the aim of each being a sequence of ones followed by
4262 a sequence of zeros. So:
4263
4264 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4265 duplicate the last X_ELT_SIZE element, to extend the
4266 current sequence of ones or zeros.
4267
4268 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4269 zero, so that the constant really does have X_ELT_SIZE and
4270 not a smaller size. */
4271 if (a_elt_size > permute_size)
4272 a_builder.quick_push (const0_rtx);
4273 else
4274 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4275 if (b_elt_size > permute_size)
4276 b_builder.quick_push (const0_rtx);
4277 else
4278 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4279 }
4280 a_builder.finalize ();
4281 b_builder.finalize ();
4282
4283 /* Try loading A into a register. */
4284 rtx_insn *last = get_last_insn ();
4285 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4286 if (!a)
4287 return NULL_RTX;
4288
4289 /* Try loading B into a register. */
4290 rtx b = a;
4291 if (a_builder != b_builder)
4292 {
4293 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4294 if (!b)
4295 {
4296 delete_insns_since (last);
4297 return NULL_RTX;
4298 }
4299 }
4300
4301 /* Emit the TRN1 itself. */
4302 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4303 target = aarch64_target_reg (target, mode);
4304 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4305 gen_lowpart (mode, a),
4306 gen_lowpart (mode, b)));
4307 return target;
4308 }
4309
4310 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4311 constant in BUILDER into an SVE predicate register. Return the register
4312 on success, otherwise return null. Use TARGET for the register if
4313 nonnull and convenient.
4314
4315 ALLOW_RECURSE_P is true if we can use methods that would call this
4316 function recursively. */
4317
4318 static rtx
4319 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4320 bool allow_recurse_p)
4321 {
4322 if (builder.encoded_nelts () == 1)
4323 /* A PFALSE or a PTRUE .B ALL. */
4324 return aarch64_emit_set_immediate (target, builder);
4325
4326 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4327 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4328 {
4329 /* If we can load the constant using PTRUE, use it as-is. */
4330 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4331 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4332 return aarch64_emit_set_immediate (target, builder);
4333
4334 /* Otherwise use WHILE to set the first VL bits. */
4335 return aarch64_sve_move_pred_via_while (target, mode, vl);
4336 }
4337
4338 if (!allow_recurse_p)
4339 return NULL_RTX;
4340
4341 /* Try inverting the vector in element size ELT_SIZE and then EORing
4342 the result with an ELT_SIZE PTRUE. */
4343 if (INTVAL (builder.elt (0)) == 0)
4344 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4345 elt_size))
4346 return res;
4347
4348 /* Try using TRN1 to permute two simpler constants. */
4349 for (unsigned int i = elt_size; i <= 8; i *= 2)
4350 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4351 elt_size, i))
4352 return res;
4353
4354 return NULL_RTX;
4355 }
4356
4357 /* Return an SVE predicate register that contains the VNx16BImode
4358 constant in BUILDER, without going through the move expanders.
4359
4360 The returned register can have whatever mode seems most natural
4361 given the contents of BUILDER. Use TARGET for the result if
4362 convenient. */
4363
4364 static rtx
4365 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4366 {
4367 /* Try loading the constant using pure predicate operations. */
4368 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4369 return res;
4370
4371 /* Try forcing the constant to memory. */
4372 if (builder.full_nelts ().is_constant ())
4373 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4374 {
4375 target = aarch64_target_reg (target, VNx16BImode);
4376 emit_move_insn (target, mem);
4377 return target;
4378 }
4379
4380 /* The last resort is to load the constant as an integer and then
4381 compare it against zero. Use -1 for set bits in order to increase
4382 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4383 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4384 builder.nelts_per_pattern ());
4385 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4386 int_builder.quick_push (INTVAL (builder.elt (i))
4387 ? constm1_rtx : const0_rtx);
4388 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4389 int_builder.build ());
4390 }
4391
4392 /* Set DEST to immediate IMM. */
4393
4394 void
4395 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4396 {
4397 machine_mode mode = GET_MODE (dest);
4398
4399 /* Check on what type of symbol it is. */
4400 scalar_int_mode int_mode;
4401 if ((GET_CODE (imm) == SYMBOL_REF
4402 || GET_CODE (imm) == LABEL_REF
4403 || GET_CODE (imm) == CONST
4404 || GET_CODE (imm) == CONST_POLY_INT)
4405 && is_a <scalar_int_mode> (mode, &int_mode))
4406 {
4407 rtx mem;
4408 poly_int64 offset;
4409 HOST_WIDE_INT const_offset;
4410 enum aarch64_symbol_type sty;
4411
4412 /* If we have (const (plus symbol offset)), separate out the offset
4413 before we start classifying the symbol. */
4414 rtx base = strip_offset (imm, &offset);
4415
4416 /* We must always add an offset involving VL separately, rather than
4417 folding it into the relocation. */
4418 if (!offset.is_constant (&const_offset))
4419 {
4420 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4421 emit_insn (gen_rtx_SET (dest, imm));
4422 else
4423 {
4424 /* Do arithmetic on 32-bit values if the result is smaller
4425 than that. */
4426 if (partial_subreg_p (int_mode, SImode))
4427 {
4428 /* It is invalid to do symbol calculations in modes
4429 narrower than SImode. */
4430 gcc_assert (base == const0_rtx);
4431 dest = gen_lowpart (SImode, dest);
4432 int_mode = SImode;
4433 }
4434 if (base != const0_rtx)
4435 {
4436 base = aarch64_force_temporary (int_mode, dest, base);
4437 aarch64_add_offset (int_mode, dest, base, offset,
4438 NULL_RTX, NULL_RTX, false);
4439 }
4440 else
4441 aarch64_add_offset (int_mode, dest, base, offset,
4442 dest, NULL_RTX, false);
4443 }
4444 return;
4445 }
4446
4447 sty = aarch64_classify_symbol (base, const_offset);
4448 switch (sty)
4449 {
4450 case SYMBOL_FORCE_TO_MEM:
4451 if (const_offset != 0
4452 && targetm.cannot_force_const_mem (int_mode, imm))
4453 {
4454 gcc_assert (can_create_pseudo_p ());
4455 base = aarch64_force_temporary (int_mode, dest, base);
4456 aarch64_add_offset (int_mode, dest, base, const_offset,
4457 NULL_RTX, NULL_RTX, false);
4458 return;
4459 }
4460
4461 mem = force_const_mem (ptr_mode, imm);
4462 gcc_assert (mem);
4463
4464 /* If we aren't generating PC relative literals, then
4465 we need to expand the literal pool access carefully.
4466 This is something that needs to be done in a number
4467 of places, so could well live as a separate function. */
4468 if (!aarch64_pcrelative_literal_loads)
4469 {
4470 gcc_assert (can_create_pseudo_p ());
4471 base = gen_reg_rtx (ptr_mode);
4472 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4473 if (ptr_mode != Pmode)
4474 base = convert_memory_address (Pmode, base);
4475 mem = gen_rtx_MEM (ptr_mode, base);
4476 }
4477
4478 if (int_mode != ptr_mode)
4479 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4480
4481 emit_insn (gen_rtx_SET (dest, mem));
4482
4483 return;
4484
4485 case SYMBOL_SMALL_TLSGD:
4486 case SYMBOL_SMALL_TLSDESC:
4487 case SYMBOL_SMALL_TLSIE:
4488 case SYMBOL_SMALL_GOT_28K:
4489 case SYMBOL_SMALL_GOT_4G:
4490 case SYMBOL_TINY_GOT:
4491 case SYMBOL_TINY_TLSIE:
4492 if (const_offset != 0)
4493 {
4494 gcc_assert (can_create_pseudo_p ());
4495 base = aarch64_force_temporary (int_mode, dest, base);
4496 aarch64_add_offset (int_mode, dest, base, const_offset,
4497 NULL_RTX, NULL_RTX, false);
4498 return;
4499 }
4500 /* FALLTHRU */
4501
4502 case SYMBOL_SMALL_ABSOLUTE:
4503 case SYMBOL_TINY_ABSOLUTE:
4504 case SYMBOL_TLSLE12:
4505 case SYMBOL_TLSLE24:
4506 case SYMBOL_TLSLE32:
4507 case SYMBOL_TLSLE48:
4508 aarch64_load_symref_appropriately (dest, imm, sty);
4509 return;
4510
4511 default:
4512 gcc_unreachable ();
4513 }
4514 }
4515
4516 if (!CONST_INT_P (imm))
4517 {
4518 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4519 {
4520 /* Only the low bit of each .H, .S and .D element is defined,
4521 so we can set the upper bits to whatever we like. If the
4522 predicate is all-true in MODE, prefer to set all the undefined
4523 bits as well, so that we can share a single .B predicate for
4524 all modes. */
4525 if (imm == CONSTM1_RTX (mode))
4526 imm = CONSTM1_RTX (VNx16BImode);
4527
4528 /* All methods for constructing predicate modes wider than VNx16BI
4529 will set the upper bits of each element to zero. Expose this
4530 by moving such constants as a VNx16BI, so that all bits are
4531 significant and so that constants for different modes can be
4532 shared. The wider constant will still be available as a
4533 REG_EQUAL note. */
4534 rtx_vector_builder builder;
4535 if (aarch64_get_sve_pred_bits (builder, imm))
4536 {
4537 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4538 if (dest != res)
4539 emit_move_insn (dest, gen_lowpart (mode, res));
4540 return;
4541 }
4542 }
4543
4544 if (GET_CODE (imm) == HIGH
4545 || aarch64_simd_valid_immediate (imm, NULL))
4546 {
4547 emit_insn (gen_rtx_SET (dest, imm));
4548 return;
4549 }
4550
4551 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4552 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4553 {
4554 if (dest != res)
4555 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4556 return;
4557 }
4558
4559 rtx mem = force_const_mem (mode, imm);
4560 gcc_assert (mem);
4561 emit_move_insn (dest, mem);
4562 return;
4563 }
4564
4565 aarch64_internal_mov_immediate (dest, imm, true,
4566 as_a <scalar_int_mode> (mode));
4567 }
4568
4569 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4570 that is known to contain PTRUE. */
4571
4572 void
4573 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4574 {
4575 expand_operand ops[3];
4576 machine_mode mode = GET_MODE (dest);
4577 create_output_operand (&ops[0], dest, mode);
4578 create_input_operand (&ops[1], pred, GET_MODE (pred));
4579 create_input_operand (&ops[2], src, mode);
4580 temporary_volatile_ok v (true);
4581 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4582 }
4583
4584 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4585 operand is in memory. In this case we need to use the predicated LD1
4586 and ST1 instead of LDR and STR, both for correctness on big-endian
4587 targets and because LD1 and ST1 support a wider range of addressing modes.
4588 PRED_MODE is the mode of the predicate.
4589
4590 See the comment at the head of aarch64-sve.md for details about the
4591 big-endian handling. */
4592
4593 void
4594 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4595 {
4596 machine_mode mode = GET_MODE (dest);
4597 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4598 if (!register_operand (src, mode)
4599 && !register_operand (dest, mode))
4600 {
4601 rtx tmp = gen_reg_rtx (mode);
4602 if (MEM_P (src))
4603 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4604 else
4605 emit_move_insn (tmp, src);
4606 src = tmp;
4607 }
4608 aarch64_emit_sve_pred_move (dest, ptrue, src);
4609 }
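
/* Editorial note: a hedged example of the memory-to-memory case above.
   For byte elements the copy is split into a predicated load into a
   fresh register followed by a predicated store, roughly:

       ptrue p0.b
       ld1b  z0.b, p0/z, [x0]
       st1b  z0.b, p0, [x1]

   (register numbers are illustrative only).  */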
4610
4611 /* Called only on big-endian targets. See whether an SVE vector move
4612 from SRC to DEST is effectively a REV[BHW] instruction, because at
4613 least one operand is a subreg of an SVE vector that has wider or
4614 narrower elements. Return true and emit the instruction if so.
4615
4616 For example:
4617
4618 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4619
4620 represents a VIEW_CONVERT between the following vectors, viewed
4621 in memory order:
4622
4623 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4624 R1: { [0], [1], [2], [3], ... }
4625
4626 The high part of lane X in R2 should therefore correspond to lane X*2
4627 of R1, but the register representations are:
4628
4629 msb lsb
4630 R2: ...... [1].high [1].low [0].high [0].low
4631 R1: ...... [3] [2] [1] [0]
4632
4633 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4634 We therefore need a reverse operation to swap the high and low values
4635 around.
4636
4637 This is purely an optimization. Without it we would spill the
4638 subreg operand to the stack in one mode and reload it in the
4639 other mode, which has the same effect as the REV. */
4640
4641 bool
4642 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4643 {
4644 gcc_assert (BYTES_BIG_ENDIAN);
4645 if (GET_CODE (dest) == SUBREG)
4646 dest = SUBREG_REG (dest);
4647 if (GET_CODE (src) == SUBREG)
4648 src = SUBREG_REG (src);
4649
4650 /* The optimization handles two single SVE REGs with different element
4651 sizes. */
4652 if (!REG_P (dest)
4653 || !REG_P (src)
4654 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4655 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4656 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4657 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4658 return false;
4659
4660 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4661 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4662 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4663 UNSPEC_REV_SUBREG);
4664 emit_insn (gen_rtx_SET (dest, unspec));
4665 return true;
4666 }
4667
4668 /* Return a copy of X with mode MODE, without changing its other
4669 attributes. Unlike gen_lowpart, this doesn't care whether the
4670 mode change is valid. */
4671
4672 rtx
4673 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4674 {
4675 if (GET_MODE (x) == mode)
4676 return x;
4677
4678 x = shallow_copy_rtx (x);
4679 set_mode_and_regno (x, mode, REGNO (x));
4680 return x;
4681 }
4682
4683 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4684 stored in wider integer containers. */
4685
4686 static unsigned int
4687 aarch64_sve_rev_unspec (machine_mode mode)
4688 {
4689 switch (GET_MODE_UNIT_SIZE (mode))
4690 {
4691 case 1: return UNSPEC_REVB;
4692 case 2: return UNSPEC_REVH;
4693 case 4: return UNSPEC_REVW;
4694 }
4695 gcc_unreachable ();
4696 }
4697
4698 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4699 operands. */
4700
4701 void
4702 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4703 {
4704 /* Decide which REV operation we need. The mode with wider elements
4705 determines the mode of the operands and the mode with the narrower
4706 elements determines the reverse width. */
4707 machine_mode mode_with_wider_elts = GET_MODE (dest);
4708 machine_mode mode_with_narrower_elts = GET_MODE (src);
4709 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4710 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4711 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4712
4713 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4714 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4715 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4716
4717 /* Get the operands in the appropriate modes and emit the instruction. */
4718 ptrue = gen_lowpart (pred_mode, ptrue);
4719 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4720 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4721 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4722 dest, ptrue, src));
4723 }
4724
4725 static bool
4726 aarch64_function_ok_for_sibcall (tree, tree exp)
4727 {
4728 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4729 return false;
4730
4731 return true;
4732 }
4733
4734 /* Implement TARGET_PASS_BY_REFERENCE. */
4735
4736 static bool
4737 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4738 const function_arg_info &arg)
4739 {
4740 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4741 HOST_WIDE_INT size;
4742 machine_mode dummymode;
4743 int nregs;
4744
4745 unsigned int num_zr, num_pr;
4746 if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
4747 {
4748 if (pcum && !pcum->silent_p && !TARGET_SVE)
4749 /* We can't gracefully recover at this point, so make this a
4750 fatal error. */
4751 fatal_error (input_location, "arguments of type %qT require"
4752 " the SVE ISA extension", arg.type);
4753
4754 /* Variadic SVE types are passed by reference. Normal non-variadic
4755 arguments are too if we've run out of registers. */
4756 return (!arg.named
4757 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4758 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4759 }
4760
4761 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4762 if (arg.mode == BLKmode && arg.type)
4763 size = int_size_in_bytes (arg.type);
4764 else
4765 /* No frontends can create types with variable-sized modes, so we
4766 shouldn't be asked to pass or return them. */
4767 size = GET_MODE_SIZE (arg.mode).to_constant ();
4768
4769 /* Aggregates are passed by reference based on their size. */
4770 if (arg.aggregate_type_p ())
4771 size = int_size_in_bytes (arg.type);
4772
4773 /* Variable sized arguments are always passed by reference. */
4774 if (size < 0)
4775 return true;
4776
4777 /* Can this be a candidate to be passed in fp/simd register(s)? */
4778 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4779 &dummymode, &nregs,
4780 NULL))
4781 return false;
4782
4783 /* Arguments which are variable sized or larger than 2 registers are
4784 passed by reference unless they are a homogeneous floating-point
4785 aggregate. */
4786 return size > 2 * UNITS_PER_WORD;
4787 }
4788
4789 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4790 static bool
4791 aarch64_return_in_msb (const_tree valtype)
4792 {
4793 machine_mode dummy_mode;
4794 int dummy_int;
4795
4796 /* Never happens in little-endian mode. */
4797 if (!BYTES_BIG_ENDIAN)
4798 return false;
4799
4800 /* Only composite types smaller than or equal to 16 bytes can
4801 be potentially returned in registers. */
4802 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4803 || int_size_in_bytes (valtype) <= 0
4804 || int_size_in_bytes (valtype) > 16)
4805 return false;
4806
4807 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4808 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4809 is always passed/returned in the least significant bits of fp/simd
4810 register(s). */
4811 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4812 &dummy_mode, &dummy_int, NULL))
4813 return false;
4814
4815 return true;
4816 }
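
/* Editorial note: a hedged illustration (the type is an assumption).  On
   a big-endian target a 12-byte structure such as

       struct s { int a, b, c; };

   passes the checks above (composite, 1..16 bytes, not an HFA/HVA) and is
   therefore returned in the most significant bits; aarch64_function_value
   below rounds its size up to 16 bytes and uses an integer mode of that
   width.  */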
4817
4818 /* Implement TARGET_FUNCTION_VALUE.
4819 Define how to find the value returned by a function. */
4820
4821 static rtx
4822 aarch64_function_value (const_tree type, const_tree func,
4823 bool outgoing ATTRIBUTE_UNUSED)
4824 {
4825 machine_mode mode;
4826 int unsignedp;
4827 int count;
4828 machine_mode ag_mode;
4829
4830 mode = TYPE_MODE (type);
4831 if (INTEGRAL_TYPE_P (type))
4832 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4833
4834 unsigned int num_zr, num_pr;
4835 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
4836 {
4837 /* Don't raise an error here if we're called when SVE is disabled,
4838 since this is really just a query function. Other code must
4839 do that where appropriate. */
4840 mode = TYPE_MODE_RAW (type);
4841 gcc_assert (VECTOR_MODE_P (mode)
4842 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4843
4844 if (num_zr > 0 && num_pr == 0)
4845 return gen_rtx_REG (mode, V0_REGNUM);
4846
4847 if (num_zr == 0 && num_pr == 1)
4848 return gen_rtx_REG (mode, P0_REGNUM);
4849
4850 gcc_unreachable ();
4851 }
4852
4853 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4854 returned in memory, not by value. */
4855 gcc_assert (!aarch64_sve_mode_p (mode));
4856
4857 if (aarch64_return_in_msb (type))
4858 {
4859 HOST_WIDE_INT size = int_size_in_bytes (type);
4860
4861 if (size % UNITS_PER_WORD != 0)
4862 {
4863 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4864 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4865 }
4866 }
4867
4868 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4869 &ag_mode, &count, NULL))
4870 {
4871 if (!aarch64_composite_type_p (type, mode))
4872 {
4873 gcc_assert (count == 1 && mode == ag_mode);
4874 return gen_rtx_REG (mode, V0_REGNUM);
4875 }
4876 else
4877 {
4878 int i;
4879 rtx par;
4880
4881 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4882 for (i = 0; i < count; i++)
4883 {
4884 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4885 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4886 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4887 XVECEXP (par, 0, i) = tmp;
4888 }
4889 return par;
4890 }
4891 }
4892 else
4893 return gen_rtx_REG (mode, R0_REGNUM);
4894 }
4895
4896 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4897 Return true if REGNO is the number of a hard register in which the values
4898 of a called function may come back. */
4899
4900 static bool
4901 aarch64_function_value_regno_p (const unsigned int regno)
4902 {
4903 /* A maximum of 16 bytes can be returned in the general registers. Examples
4904 of 16-byte return values are: 128-bit integers and 16-byte small
4905 structures (excluding homogeneous floating-point aggregates). */
4906 if (regno == R0_REGNUM || regno == R1_REGNUM)
4907 return true;
4908
4909 /* Up to four fp/simd registers can return a function value, e.g. a
4910 homogeneous floating-point aggregate having four members. */
4911 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4912 return TARGET_FLOAT;
4913
4914 return false;
4915 }
4916
4917 /* Implement TARGET_RETURN_IN_MEMORY.
4918
4919 If the type T of the result of a function is such that
4920 void func (T arg)
4921 would require that arg be passed as a value in a register (or set of
4922 registers) according to the parameter passing rules, then the result
4923 is returned in the same registers as would be used for such an
4924 argument. */
4925
4926 static bool
4927 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4928 {
4929 HOST_WIDE_INT size;
4930 machine_mode ag_mode;
4931 int count;
4932
4933 if (!AGGREGATE_TYPE_P (type)
4934 && TREE_CODE (type) != COMPLEX_TYPE
4935 && TREE_CODE (type) != VECTOR_TYPE)
4936 /* Simple scalar types are always returned in registers. */
4937 return false;
4938
4939 unsigned int num_zr, num_pr;
4940 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
4941 {
4942 /* All SVE types we support fit in registers. For example, it isn't
4943 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
4944 predicates. */
4945 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
4946 return false;
4947 }
4948
4949 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4950 type,
4951 &ag_mode,
4952 &count,
4953 NULL))
4954 return false;
4955
4956 /* Types larger than 2 registers are returned in memory. */
4957 size = int_size_in_bytes (type);
4958 return (size < 0 || size > 2 * UNITS_PER_WORD);
4959 }
4960
4961 static bool
4962 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4963 const_tree type, int *nregs)
4964 {
4965 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4966 return aarch64_vfp_is_call_or_return_candidate (mode,
4967 type,
4968 &pcum->aapcs_vfp_rmode,
4969 nregs,
4970 NULL);
4971 }
4972
4973 /* Given MODE and TYPE of a function argument, return the alignment in
4974 bits. The idea is to suppress any stronger alignment requested by
4975 the user and opt for the natural alignment (specified in AAPCS64 \S
4976 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4977 calculated in versions of GCC prior to GCC-9. This is a helper
4978 function for local use only. */
4979
4980 static unsigned int
4981 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4982 bool *abi_break)
4983 {
4984 *abi_break = false;
4985 if (!type)
4986 return GET_MODE_ALIGNMENT (mode);
4987
4988 if (integer_zerop (TYPE_SIZE (type)))
4989 return 0;
4990
4991 gcc_assert (TYPE_MODE (type) == mode);
4992
4993 if (!AGGREGATE_TYPE_P (type))
4994 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4995
4996 if (TREE_CODE (type) == ARRAY_TYPE)
4997 return TYPE_ALIGN (TREE_TYPE (type));
4998
4999 unsigned int alignment = 0;
5000 unsigned int bitfield_alignment = 0;
5001 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5002 if (TREE_CODE (field) == FIELD_DECL)
5003 {
5004 alignment = std::max (alignment, DECL_ALIGN (field));
5005 if (DECL_BIT_FIELD_TYPE (field))
5006 bitfield_alignment
5007 = std::max (bitfield_alignment,
5008 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5009 }
5010
5011 if (bitfield_alignment > alignment)
5012 {
5013 *abi_break = true;
5014 return bitfield_alignment;
5015 }
5016
5017 return alignment;
5018 }
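
/* Editorial note: a hedged example of the bit-field rule above (the type
   is an assumption, not taken from this file).  For

       struct s { char c; long long b : 1; };

   the declared bit-field type (long long) is 64-bit aligned; if, as is
   usual, that exceeds the maximum DECL_ALIGN of the fields, the function
   returns 64 bits and sets *ABI_BREAK, since releases before GCC 9.1
   used the smaller value.  */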
5019
5020 /* Lay out a function argument according to the AAPCS64 rules. The rule
5021 numbers below refer to the corresponding rules in the AAPCS64. */
5022
5023 static void
5024 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5025 {
5026 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5027 tree type = arg.type;
5028 machine_mode mode = arg.mode;
5029 int ncrn, nvrn, nregs;
5030 bool allocate_ncrn, allocate_nvrn;
5031 HOST_WIDE_INT size;
5032 bool abi_break;
5033
5034 /* We need to do this once per argument. */
5035 if (pcum->aapcs_arg_processed)
5036 return;
5037
5038 pcum->aapcs_arg_processed = true;
5039
5040 unsigned int num_zr, num_pr;
5041 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
5042 {
5043 /* The PCS says that it is invalid to pass an SVE value to an
5044 unprototyped function. There is no ABI-defined location we
5045 can return in this case, so we have no real choice but to raise
5046 an error immediately, even though this is only a query function. */
5047 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5048 {
5049 gcc_assert (!pcum->silent_p);
5050 error ("SVE type %qT cannot be passed to an unprototyped function",
5051 arg.type);
5052 /* Avoid repeating the message, and avoid tripping the assert
5053 below. */
5054 pcum->pcs_variant = ARM_PCS_SVE;
5055 }
5056
5057 /* We would have converted the argument into pass-by-reference
5058 form if it didn't fit in registers. */
5059 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5060 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5061 gcc_assert (arg.named
5062 && pcum->pcs_variant == ARM_PCS_SVE
5063 && aarch64_sve_mode_p (mode)
5064 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5065 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5066
5067 if (num_zr > 0 && num_pr == 0)
5068 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5069 else if (num_zr == 0 && num_pr == 1)
5070 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5071 else
5072 gcc_unreachable ();
5073 return;
5074 }
5075
5076 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5077 passed by reference, not by value. */
5078 gcc_assert (!aarch64_sve_mode_p (mode));
5079
5080 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5081 if (type)
5082 size = int_size_in_bytes (type);
5083 else
5084 /* No frontends can create types with variable-sized modes, so we
5085 shouldn't be asked to pass or return them. */
5086 size = GET_MODE_SIZE (mode).to_constant ();
5087 size = ROUND_UP (size, UNITS_PER_WORD);
5088
5089 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5090 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5091 mode,
5092 type,
5093 &nregs);
5094
5095 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
5096 The following code thus handles passing by SIMD/FP registers first. */
5097
5098 nvrn = pcum->aapcs_nvrn;
5099
5100 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5101 and homogeneous short-vector aggregates (HVA). */
5102 if (allocate_nvrn)
5103 {
5104 if (!pcum->silent_p && !TARGET_FLOAT)
5105 aarch64_err_no_fpadvsimd (mode);
5106
5107 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5108 {
5109 pcum->aapcs_nextnvrn = nvrn + nregs;
5110 if (!aarch64_composite_type_p (type, mode))
5111 {
5112 gcc_assert (nregs == 1);
5113 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5114 }
5115 else
5116 {
5117 rtx par;
5118 int i;
5119 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5120 for (i = 0; i < nregs; i++)
5121 {
5122 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5123 V0_REGNUM + nvrn + i);
5124 rtx offset = gen_int_mode
5125 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5126 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5127 XVECEXP (par, 0, i) = tmp;
5128 }
5129 pcum->aapcs_reg = par;
5130 }
5131 return;
5132 }
5133 else
5134 {
5135 /* C.3 NSRN is set to 8. */
5136 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5137 goto on_stack;
5138 }
5139 }
5140
5141 ncrn = pcum->aapcs_ncrn;
5142 nregs = size / UNITS_PER_WORD;
5143
5144 /* C6 - C9, though the sign and zero extension semantics are
5145 handled elsewhere. This is the case where the argument fits
5146 entirely in general registers. */
5147 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5148 {
5149 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5150
5151 /* C.8 if the argument has an alignment of 16 then the NGRN is
5152 rounded up to the next even number. */
5153 if (nregs == 2
5154 && ncrn % 2
5155 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5156 comparison is there because for > 16 * BITS_PER_UNIT
5157 alignment nregs should be > 2 and therefore it should be
5158 passed by reference rather than by value. */
5159 && (aarch64_function_arg_alignment (mode, type, &abi_break)
5160 == 16 * BITS_PER_UNIT))
5161 {
5162 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5163 inform (input_location, "parameter passing for argument of type "
5164 "%qT changed in GCC 9.1", type);
5165 ++ncrn;
5166 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5167 }
5168
5169 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5170 A reg is still generated for it, but the caller should be smart
5171 enough not to use it. */
5172 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5173 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5174 else
5175 {
5176 rtx par;
5177 int i;
5178
5179 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5180 for (i = 0; i < nregs; i++)
5181 {
5182 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5183 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5184 GEN_INT (i * UNITS_PER_WORD));
5185 XVECEXP (par, 0, i) = tmp;
5186 }
5187 pcum->aapcs_reg = par;
5188 }
5189
5190 pcum->aapcs_nextncrn = ncrn + nregs;
5191 return;
5192 }
5193
5194 /* C.11 */
5195 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5196
5197 /* The argument is passed on the stack; record the needed number of words for
5198 this argument and align the total size if necessary. */
5199 on_stack:
5200 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5201
5202 if (aarch64_function_arg_alignment (mode, type, &abi_break)
5203 == 16 * BITS_PER_UNIT)
5204 {
5205 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5206 if (pcum->aapcs_stack_size != new_size)
5207 {
5208 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5209 inform (input_location, "parameter passing for argument of type "
5210 "%qT changed in GCC 9.1", type);
5211 pcum->aapcs_stack_size = new_size;
5212 }
5213 }
5214 return;
5215 }
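
/* Editorial note: a hedged example of rule C.8 above (the types are
   assumptions).  Given

       struct s { __int128 x; };
       void f (int a, struct s b);

   argument A occupies W0, leaving NGRN = 1; B needs two registers and has
   16-byte alignment, so NGRN is rounded up to 2 and B is passed in X2/X3
   rather than X1/X2.  */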
5216
5217 /* Implement TARGET_FUNCTION_ARG. */
5218
5219 static rtx
5220 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5221 {
5222 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5223 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5224 || pcum->pcs_variant == ARM_PCS_SIMD
5225 || pcum->pcs_variant == ARM_PCS_SVE);
5226
5227 if (arg.end_marker_p ())
5228 return gen_int_mode (pcum->pcs_variant, DImode);
5229
5230 aarch64_layout_arg (pcum_v, arg);
5231 return pcum->aapcs_reg;
5232 }
5233
5234 void
5235 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5236 const_tree fntype,
5237 rtx libname ATTRIBUTE_UNUSED,
5238 const_tree fndecl ATTRIBUTE_UNUSED,
5239 unsigned n_named ATTRIBUTE_UNUSED,
5240 bool silent_p)
5241 {
5242 pcum->aapcs_ncrn = 0;
5243 pcum->aapcs_nvrn = 0;
5244 pcum->aapcs_nprn = 0;
5245 pcum->aapcs_nextncrn = 0;
5246 pcum->aapcs_nextnvrn = 0;
5247 pcum->aapcs_nextnprn = 0;
5248 if (fntype)
5249 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5250 else
5251 pcum->pcs_variant = ARM_PCS_AAPCS64;
5252 pcum->aapcs_reg = NULL_RTX;
5253 pcum->aapcs_arg_processed = false;
5254 pcum->aapcs_stack_words = 0;
5255 pcum->aapcs_stack_size = 0;
5256 pcum->silent_p = silent_p;
5257
5258 if (!silent_p
5259 && !TARGET_FLOAT
5260 && fndecl && TREE_PUBLIC (fndecl)
5261 && fntype && fntype != error_mark_node)
5262 {
5263 const_tree type = TREE_TYPE (fntype);
5264 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5265 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5266 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5267 &mode, &nregs, NULL))
5268 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5269 }
5270
5271 if (!silent_p
5272 && !TARGET_SVE
5273 && pcum->pcs_variant == ARM_PCS_SVE)
5274 {
5275 /* We can't gracefully recover at this point, so make this a
5276 fatal error. */
5277 if (fndecl)
5278 fatal_error (input_location, "%qE requires the SVE ISA extension",
5279 fndecl);
5280 else
5281 fatal_error (input_location, "calls to functions of type %qT require"
5282 " the SVE ISA extension", fntype);
5283 }
5284 }
5285
5286 static void
5287 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5288 const function_arg_info &arg)
5289 {
5290 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5291 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5292 || pcum->pcs_variant == ARM_PCS_SIMD
5293 || pcum->pcs_variant == ARM_PCS_SVE)
5294 {
5295 aarch64_layout_arg (pcum_v, arg);
5296 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5297 != (pcum->aapcs_stack_words != 0));
5298 pcum->aapcs_arg_processed = false;
5299 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5300 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5301 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5302 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5303 pcum->aapcs_stack_words = 0;
5304 pcum->aapcs_reg = NULL_RTX;
5305 }
5306 }
5307
5308 bool
5309 aarch64_function_arg_regno_p (unsigned regno)
5310 {
5311 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5312 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5313 }
5314
5315 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5316 PARM_BOUNDARY bits of alignment, but will be given anything up
5317 to STACK_BOUNDARY bits if the type requires it. This makes sure
5318 that both before and after the layout of each argument, the Next
5319 Stacked Argument Address (NSAA) will have a minimum alignment of
5320 8 bytes. */
5321
5322 static unsigned int
5323 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5324 {
5325 bool abi_break;
5326 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5327 &abi_break);
5328 if (abi_break && warn_psabi)
5329 inform (input_location, "parameter passing for argument of type "
5330 "%qT changed in GCC 9.1", type);
5331
5332 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5333 }
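/* To illustrate the clamp above (the boundary values are assumptions
   based on the usual AArch64 definitions of PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128): a char argument still reports a 64-bit
   boundary, while a type demanding 256-bit alignment is clamped down to
   128 bits, preserving the 8-byte NSAA invariant described above.  */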
5334
5335 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5336
5337 static fixed_size_mode
5338 aarch64_get_reg_raw_mode (int regno)
5339 {
5340 if (TARGET_SVE && FP_REGNUM_P (regno))
5341 /* Don't use the SVE part of the register for __builtin_apply and
5342 __builtin_return. The SVE registers aren't used by the normal PCS,
5343 so using them there would be a waste of time. The PCS extensions
5344 for SVE types are fundamentally incompatible with the
5345 __builtin_return/__builtin_apply interface. */
5346 return as_a <fixed_size_mode> (V16QImode);
5347 return default_get_reg_raw_mode (regno);
5348 }
5349
5350 /* Implement TARGET_FUNCTION_ARG_PADDING.
5351
5352 Small aggregate types are placed at the lowest memory address.
5353
5354 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5355
5356 static pad_direction
5357 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5358 {
5359 /* On little-endian targets, the least significant byte of every stack
5360 argument is passed at the lowest byte address of the stack slot. */
5361 if (!BYTES_BIG_ENDIAN)
5362 return PAD_UPWARD;
5363
5364 /* Otherwise, integral, floating-point and pointer types are padded downward:
5365 the least significant byte of a stack argument is passed at the highest
5366 byte address of the stack slot. */
5367 if (type
5368 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5369 || POINTER_TYPE_P (type))
5370 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5371 return PAD_DOWNWARD;
5372
5373 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5374 return PAD_UPWARD;
5375 }
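/* For instance, on a big-endian target a 4-byte int passed in an 8-byte
   stack slot is padded downward: the value occupies the highest-addressed
   4 bytes of the slot, so its least significant byte ends up at the
   highest byte address, as described above.  */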
5376
5377 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5378
5379 It specifies the padding for the last (and possibly the only)
5380 element of a block move between registers and memory. Assuming
5381 the block is in memory, padding upward means that the last
5382 element is padded after its most significant byte, while with
5383 downward padding the last element is padded at its least
5384 significant byte side.
5385
5386 Small aggregates and small complex types are always padded
5387 upwards.
5388
5389 We don't need to worry about homogeneous floating-point or
5390 short-vector aggregates; their move is not affected by the
5391 padding direction determined here. Regardless of endianness,
5392 each element of such an aggregate is put in the least
5393 significant bits of a fp/simd register.
5394
5395 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5396 register has useful data, and return the opposite if the most
5397 significant byte does. */
5398
5399 bool
5400 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5401 bool first ATTRIBUTE_UNUSED)
5402 {
5403
5404 /* Small composite types are always padded upward. */
5405 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5406 {
5407 HOST_WIDE_INT size;
5408 if (type)
5409 size = int_size_in_bytes (type);
5410 else
5411 /* No frontends can create types with variable-sized modes, so we
5412 shouldn't be asked to pass or return them. */
5413 size = GET_MODE_SIZE (mode).to_constant ();
5414 if (size < 2 * UNITS_PER_WORD)
5415 return true;
5416 }
5417
5418 /* Otherwise, use the default padding. */
5419 return !BYTES_BIG_ENDIAN;
5420 }
5421
5422 static scalar_int_mode
5423 aarch64_libgcc_cmp_return_mode (void)
5424 {
5425 return SImode;
5426 }
5427
5428 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5429
5430 /* We use the 12-bit shifted immediate arithmetic instructions so values
5431 must be a multiple of (1 << 12), i.e. 4096. */
5432 #define ARITH_FACTOR 4096
5433
5434 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5435 #error Cannot use simple address calculation for stack probing
5436 #endif
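/* For instance, with the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 (an
   assumption; a target can override it), PROBE_INTERVAL is 4096 and is an
   exact multiple of ARITH_FACTOR, so the check above passes and every
   probe offset can be formed with a single 12-bit shifted immediate.  */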
5437
5438 /* The pair of scratch registers used for stack probing. */
5439 #define PROBE_STACK_FIRST_REG R9_REGNUM
5440 #define PROBE_STACK_SECOND_REG R10_REGNUM
5441
5442 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5443 inclusive. These are offsets from the current stack pointer. */
5444
5445 static void
5446 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5447 {
5448 HOST_WIDE_INT size;
5449 if (!poly_size.is_constant (&size))
5450 {
5451 sorry ("stack probes for SVE frames");
5452 return;
5453 }
5454
5455 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5456
5457 /* See the same assertion on PROBE_INTERVAL above. */
5458 gcc_assert ((first % ARITH_FACTOR) == 0);
5459
5460 /* See if we have a constant small number of probes to generate. If so,
5461 that's the easy case. */
5462 if (size <= PROBE_INTERVAL)
5463 {
5464 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5465
5466 emit_set_insn (reg1,
5467 plus_constant (Pmode,
5468 stack_pointer_rtx, -(first + base)));
5469 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5470 }
5471
5472 /* The run-time loop is made up of 8 insns in the generic case while the
5473 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5474 else if (size <= 4 * PROBE_INTERVAL)
5475 {
5476 HOST_WIDE_INT i, rem;
5477
5478 emit_set_insn (reg1,
5479 plus_constant (Pmode,
5480 stack_pointer_rtx,
5481 -(first + PROBE_INTERVAL)));
5482 emit_stack_probe (reg1);
5483
5484 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5485 it exceeds SIZE. If only two probes are needed, this will not
5486 generate any code. Then probe at FIRST + SIZE. */
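/* For instance, with PROBE_INTERVAL == 4096 and SIZE == 3 * 4096 + 100
   (an illustrative value), the loop below emits probes at FIRST + 8192
   and FIRST + 12288, and the residual handling that follows probes at
   FIRST + SIZE.  */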
5487 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5488 {
5489 emit_set_insn (reg1,
5490 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5491 emit_stack_probe (reg1);
5492 }
5493
5494 rem = size - (i - PROBE_INTERVAL);
5495 if (rem > 256)
5496 {
5497 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5498
5499 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5500 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5501 }
5502 else
5503 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5504 }
5505
5506 /* Otherwise, do the same as above, but in a loop. Note that we must be
5507 extra careful with variables wrapping around because we might be at
5508 the very top (or the very bottom) of the address space and we have
5509 to be able to handle this case properly; in particular, we use an
5510 equality test for the loop condition. */
5511 else
5512 {
5513 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5514
5515 /* Step 1: round SIZE to the previous multiple of the interval. */
5516
5517 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5518
5519
5520 /* Step 2: compute initial and final value of the loop counter. */
5521
5522 /* TEST_ADDR = SP + FIRST. */
5523 emit_set_insn (reg1,
5524 plus_constant (Pmode, stack_pointer_rtx, -first));
5525
5526 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5527 HOST_WIDE_INT adjustment = - (first + rounded_size);
5528 if (! aarch64_uimm12_shift (adjustment))
5529 {
5530 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5531 true, Pmode);
5532 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5533 }
5534 else
5535 emit_set_insn (reg2,
5536 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5537
5538 /* Step 3: the loop
5539
5540 do
5541 {
5542 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5543 probe at TEST_ADDR
5544 }
5545 while (TEST_ADDR != LAST_ADDR)
5546
5547 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5548 until it is equal to ROUNDED_SIZE. */
5549
5550 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5551
5552
5553 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5554 that SIZE is equal to ROUNDED_SIZE. */
5555
5556 if (size != rounded_size)
5557 {
5558 HOST_WIDE_INT rem = size - rounded_size;
5559
5560 if (rem > 256)
5561 {
5562 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5563
5564 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5565 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5566 }
5567 else
5568 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5569 }
5570 }
5571
5572 /* Make sure nothing is scheduled before we are done. */
5573 emit_insn (gen_blockage ());
5574 }
5575
5576 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5577 absolute addresses. */
5578
5579 const char *
5580 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5581 {
5582 static int labelno = 0;
5583 char loop_lab[32];
5584 rtx xops[2];
5585
5586 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5587
5588 /* Loop. */
5589 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5590
5591 HOST_WIDE_INT stack_clash_probe_interval
5592 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5593
5594 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5595 xops[0] = reg1;
5596 HOST_WIDE_INT interval;
5597 if (flag_stack_clash_protection)
5598 interval = stack_clash_probe_interval;
5599 else
5600 interval = PROBE_INTERVAL;
5601
5602 gcc_assert (aarch64_uimm12_shift (interval));
5603 xops[1] = GEN_INT (interval);
5604
5605 output_asm_insn ("sub\t%0, %0, %1", xops);
5606
5607 /* If doing stack clash protection then we probe up by the ABI specified
5608 amount. We do this because we're dropping full pages at a time in the
5609 loop. But if we're doing non-stack-clash probing, probe at SP + 0. */
5610 if (flag_stack_clash_protection)
5611 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5612 else
5613 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5614
5615 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5616 by this amount for each iteration. */
5617 output_asm_insn ("str\txzr, [%0, %1]", xops);
5618
5619 /* Test if TEST_ADDR == LAST_ADDR. */
5620 xops[1] = reg2;
5621 output_asm_insn ("cmp\t%0, %1", xops);
5622
5623 /* Branch. */
5624 fputs ("\tb.ne\t", asm_out_file);
5625 assemble_name_raw (asm_out_file, loop_lab);
5626 fputc ('\n', asm_out_file);
5627
5628 return "";
5629 }
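/* A minimal sketch of the sequence emitted above for the non-stack-clash
   case, assuming the default 4096-byte PROBE_INTERVAL; the x9/x10 choice
   follows PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG and the exact
   label suffix is assembler-internal:

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9, 0]
		cmp	x9, x10
		b.ne	.LPSRL0  */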
5630
5631 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5632 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5633 of GUARD_SIZE. When a probe is emitted it is done at most
5634 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5635 at most MIN_PROBE_THRESHOLD. By the end of this function
5636 BASE = BASE - ADJUSTMENT. */
5637
5638 const char *
5639 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5640 rtx min_probe_threshold, rtx guard_size)
5641 {
5642 /* This function is not allowed to use any instruction generation function
5643 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5644 so instead emit the code you want using output_asm_insn. */
5645 gcc_assert (flag_stack_clash_protection);
5646 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5647 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5648
5649 /* The minimum required allocation before the residual requires probing. */
5650 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5651
5652 /* Clamp the value down to the nearest value that can be used with a cmp. */
5653 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5654 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5655
5656 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5657 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5658
5659 static int labelno = 0;
5660 char loop_start_lab[32];
5661 char loop_end_lab[32];
5662 rtx xops[2];
5663
5664 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5665 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5666
5667 /* Emit loop start label. */
5668 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5669
5670 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5671 xops[0] = adjustment;
5672 xops[1] = probe_offset_value_rtx;
5673 output_asm_insn ("cmp\t%0, %1", xops);
5674
5675 /* Branch to end if not enough adjustment to probe. */
5676 fputs ("\tb.lt\t", asm_out_file);
5677 assemble_name_raw (asm_out_file, loop_end_lab);
5678 fputc ('\n', asm_out_file);
5679
5680 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5681 xops[0] = base;
5682 xops[1] = probe_offset_value_rtx;
5683 output_asm_insn ("sub\t%0, %0, %1", xops);
5684
5685 /* Probe at BASE. */
5686 xops[1] = const0_rtx;
5687 output_asm_insn ("str\txzr, [%0, %1]", xops);
5688
5689 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5690 xops[0] = adjustment;
5691 xops[1] = probe_offset_value_rtx;
5692 output_asm_insn ("sub\t%0, %0, %1", xops);
5693
5694 /* Branch to start if still more bytes to allocate. */
5695 fputs ("\tb\t", asm_out_file);
5696 assemble_name_raw (asm_out_file, loop_start_lab);
5697 fputc ('\n', asm_out_file);
5698
5699 /* No probe needed for the remaining adjustment; leave the loop. */
5700 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5701
5702 /* BASE = BASE - ADJUSTMENT. */
5703 xops[0] = base;
5704 xops[1] = adjustment;
5705 output_asm_insn ("sub\t%0, %0, %1", xops);
5706 return "";
5707 }
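/* A rough sketch of the loop emitted above (the register names are
   illustrative; <guard> stands for the clamped RESIDUAL_PROBE_GUARD
   value):

	.SVLPSPL0:
		cmp	x11, <guard>
		b.lt	.SVLPEND0
		sub	x12, x12, <guard>
		str	xzr, [x12, 0]
		sub	x11, x11, <guard>
		b	.SVLPSPL0
	.SVLPEND0:
		sub	x12, x12, x11  */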
5708
5709 /* Determine whether a frame chain needs to be generated. */
5710 static bool
5711 aarch64_needs_frame_chain (void)
5712 {
5713 /* Force a frame chain for EH returns so the return address is at FP+8. */
5714 if (frame_pointer_needed || crtl->calls_eh_return)
5715 return true;
5716
5717 /* A leaf function cannot have calls or write LR. */
5718 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5719
5720 /* Don't use a frame chain in leaf functions if leaf frame pointers
5721 are disabled. */
5722 if (flag_omit_leaf_frame_pointer && is_leaf)
5723 return false;
5724
5725 return aarch64_use_frame_pointer;
5726 }
5727
5728 /* Mark the registers that need to be saved by the callee and calculate
5729 the size of the callee-saved registers area and frame record (both FP
5730 and LR may be omitted). */
5731 static void
5732 aarch64_layout_frame (void)
5733 {
5734 poly_int64 offset = 0;
5735 int regno, last_fp_reg = INVALID_REGNUM;
5736 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5737 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5738 bool frame_related_fp_reg_p = false;
5739 aarch64_frame &frame = cfun->machine->frame;
5740
5741 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5742
5743 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5744 the mid-end is doing. */
5745 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5746
5747 #define SLOT_NOT_REQUIRED (-2)
5748 #define SLOT_REQUIRED (-1)
5749
5750 frame.wb_candidate1 = INVALID_REGNUM;
5751 frame.wb_candidate2 = INVALID_REGNUM;
5752 frame.spare_pred_reg = INVALID_REGNUM;
5753
5754 /* First mark all the registers that really need to be saved... */
5755 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5756 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5757
5758 /* ... that includes the eh data registers (if needed)... */
5759 if (crtl->calls_eh_return)
5760 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5761 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5762
5763 /* ... and any callee saved register that dataflow says is live. */
5764 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5765 if (df_regs_ever_live_p (regno)
5766 && !fixed_regs[regno]
5767 && (regno == R30_REGNUM
5768 || !crtl->abi->clobbers_full_reg_p (regno)))
5769 frame.reg_offset[regno] = SLOT_REQUIRED;
5770
5771 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5772 if (df_regs_ever_live_p (regno)
5773 && !fixed_regs[regno]
5774 && !crtl->abi->clobbers_full_reg_p (regno))
5775 {
5776 frame.reg_offset[regno] = SLOT_REQUIRED;
5777 last_fp_reg = regno;
5778 if (aarch64_emit_cfi_for_reg_p (regno))
5779 frame_related_fp_reg_p = true;
5780 }
5781
5782 /* Big-endian SVE frames need a spare predicate register in order
5783 to save Z8-Z15. Decide which register they should use. Prefer
5784 an unused argument register if possible, so that we don't force P4
5785 to be saved unnecessarily. */
5786 if (frame_related_fp_reg_p
5787 && crtl->abi->id () == ARM_PCS_SVE
5788 && BYTES_BIG_ENDIAN)
5789 {
5790 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5791 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5792 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5793 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5794 break;
5795 gcc_assert (regno <= P7_REGNUM);
5796 frame.spare_pred_reg = regno;
5797 df_set_regs_ever_live (regno, true);
5798 }
5799
5800 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5801 if (df_regs_ever_live_p (regno)
5802 && !fixed_regs[regno]
5803 && !crtl->abi->clobbers_full_reg_p (regno))
5804 frame.reg_offset[regno] = SLOT_REQUIRED;
5805
5806 /* With stack-clash, LR must be saved in non-leaf functions. */
5807 gcc_assert (crtl->is_leaf
5808 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
5809
5810 /* Now assign stack slots for the registers. Start with the predicate
5811 registers, since predicate LDR and STR have a relatively small
5812 offset range. These saves happen below the hard frame pointer. */
5813 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5814 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5815 {
5816 frame.reg_offset[regno] = offset;
5817 offset += BYTES_PER_SVE_PRED;
5818 }
5819
5820 /* We save a maximum of 8 predicate registers, and since vector
5821 registers are 8 times the size of a predicate register, all the
5822 saved predicates fit within a single vector. Doing this also
5823 rounds the offset to a 128-bit boundary. */
5824 if (maybe_ne (offset, 0))
5825 {
5826 gcc_assert (known_le (offset, vector_save_size));
5827 offset = vector_save_size;
5828 }
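/* As a concrete example of the sizing argument above (the vector length
   is an assumption): with 256-bit SVE vectors, BYTES_PER_SVE_PRED is 4,
   so the at most 8 saved predicates occupy at most 32 bytes, which is
   exactly the 32-byte vector_save_size that OFFSET is rounded up to.  */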
5829
5830 /* If we need to save any SVE vector registers, add them next. */
5831 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
5832 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5833 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5834 {
5835 frame.reg_offset[regno] = offset;
5836 offset += vector_save_size;
5837 }
5838
5839 /* OFFSET is now the offset of the hard frame pointer from the bottom
5840 of the callee save area. */
5841 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
5842 frame.below_hard_fp_saved_regs_size = offset;
5843 if (frame.emit_frame_chain)
5844 {
5845 /* FP and LR are placed in the linkage record. */
5846 frame.reg_offset[R29_REGNUM] = offset;
5847 frame.wb_candidate1 = R29_REGNUM;
5848 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
5849 frame.wb_candidate2 = R30_REGNUM;
5850 offset += 2 * UNITS_PER_WORD;
5851 }
5852
5853 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5854 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5855 {
5856 frame.reg_offset[regno] = offset;
5857 if (frame.wb_candidate1 == INVALID_REGNUM)
5858 frame.wb_candidate1 = regno;
5859 else if (frame.wb_candidate2 == INVALID_REGNUM)
5860 frame.wb_candidate2 = regno;
5861 offset += UNITS_PER_WORD;
5862 }
5863
5864 poly_int64 max_int_offset = offset;
5865 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5866 bool has_align_gap = maybe_ne (offset, max_int_offset);
5867
5868 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5869 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5870 {
5871 /* If there is an alignment gap between integer and fp callee-saves,
5872 allocate the last fp register to it if possible. */
5873 if (regno == last_fp_reg
5874 && has_align_gap
5875 && known_eq (vector_save_size, 8)
5876 && multiple_p (offset, 16))
5877 {
5878 frame.reg_offset[regno] = max_int_offset;
5879 break;
5880 }
5881
5882 frame.reg_offset[regno] = offset;
5883 if (frame.wb_candidate1 == INVALID_REGNUM)
5884 frame.wb_candidate1 = regno;
5885 else if (frame.wb_candidate2 == INVALID_REGNUM
5886 && frame.wb_candidate1 >= V0_REGNUM)
5887 frame.wb_candidate2 = regno;
5888 offset += vector_save_size;
5889 }
5890
5891 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5892
5893 frame.saved_regs_size = offset;
5894
5895 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
5896
5897 poly_int64 above_outgoing_args
5898 = aligned_upper_bound (varargs_and_saved_regs_size
5899 + get_frame_size (),
5900 STACK_BOUNDARY / BITS_PER_UNIT);
5901
5902 frame.hard_fp_offset
5903 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
5904
5905 /* Both these values are already aligned. */
5906 gcc_assert (multiple_p (crtl->outgoing_args_size,
5907 STACK_BOUNDARY / BITS_PER_UNIT));
5908 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
5909
5910 frame.locals_offset = frame.saved_varargs_size;
5911
5912 frame.initial_adjust = 0;
5913 frame.final_adjust = 0;
5914 frame.callee_adjust = 0;
5915 frame.sve_callee_adjust = 0;
5916 frame.callee_offset = 0;
5917
5918 HOST_WIDE_INT max_push_offset = 0;
5919 if (frame.wb_candidate2 != INVALID_REGNUM)
5920 max_push_offset = 512;
5921 else if (frame.wb_candidate1 != INVALID_REGNUM)
5922 max_push_offset = 256;
5923
5924 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
5925 HOST_WIDE_INT const_saved_regs_size;
5926 if (frame.frame_size.is_constant (&const_size)
5927 && const_size < max_push_offset
5928 && known_eq (frame.hard_fp_offset, const_size))
5929 {
5930 /* Simple, small frame with no outgoing arguments:
5931
5932 stp reg1, reg2, [sp, -frame_size]!
5933 stp reg3, reg4, [sp, 16] */
5934 frame.callee_adjust = const_size;
5935 }
5936 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
5937 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
5938 && const_outgoing_args_size + const_saved_regs_size < 512
5939 /* We could handle this case even with outgoing args, provided
5940 that the number of args left us with valid offsets for all
5941 predicate and vector save slots. It's such a rare case that
5942 it hardly seems worth the effort though. */
5943 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
5944 && !(cfun->calls_alloca
5945 && frame.hard_fp_offset.is_constant (&const_fp_offset)
5946 && const_fp_offset < max_push_offset))
5947 {
5948 /* Frame with small outgoing arguments:
5949
5950 sub sp, sp, frame_size
5951 stp reg1, reg2, [sp, outgoing_args_size]
5952 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5953 frame.initial_adjust = frame.frame_size;
5954 frame.callee_offset = const_outgoing_args_size;
5955 }
5956 else if (saves_below_hard_fp_p
5957 && known_eq (frame.saved_regs_size,
5958 frame.below_hard_fp_saved_regs_size))
5959 {
5960 /* Frame in which all saves are SVE saves:
5961
5962 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
5963 save SVE registers relative to SP
5964 sub sp, sp, outgoing_args_size */
5965 frame.initial_adjust = (frame.hard_fp_offset
5966 + frame.below_hard_fp_saved_regs_size);
5967 frame.final_adjust = crtl->outgoing_args_size;
5968 }
5969 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
5970 && const_fp_offset < max_push_offset)
5971 {
5972 /* Frame with large outgoing arguments or SVE saves, but with
5973 a small local area:
5974
5975 stp reg1, reg2, [sp, -hard_fp_offset]!
5976 stp reg3, reg4, [sp, 16]
5977 [sub sp, sp, below_hard_fp_saved_regs_size]
5978 [save SVE registers relative to SP]
5979 sub sp, sp, outgoing_args_size */
5980 frame.callee_adjust = const_fp_offset;
5981 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
5982 frame.final_adjust = crtl->outgoing_args_size;
5983 }
5984 else
5985 {
5986 /* Frame with large local area and outgoing arguments or SVE saves,
5987 using frame pointer:
5988
5989 sub sp, sp, hard_fp_offset
5990 stp x29, x30, [sp, 0]
5991 add x29, sp, 0
5992 stp reg3, reg4, [sp, 16]
5993 [sub sp, sp, below_hard_fp_saved_regs_size]
5994 [save SVE registers relative to SP]
5995 sub sp, sp, outgoing_args_size */
5996 frame.initial_adjust = frame.hard_fp_offset;
5997 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
5998 frame.final_adjust = crtl->outgoing_args_size;
5999 }
6000
6001 /* Make sure the individual adjustments add up to the full frame size. */
6002 gcc_assert (known_eq (frame.initial_adjust
6003 + frame.callee_adjust
6004 + frame.sve_callee_adjust
6005 + frame.final_adjust, frame.frame_size));
6006
6007 frame.laid_out = true;
6008 }
6009
6010 /* Return true if the register REGNO is saved on entry to
6011 the current function. */
6012
6013 static bool
6014 aarch64_register_saved_on_entry (int regno)
6015 {
6016 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6017 }
6018
6019 /* Return the next register at or after REGNO, up to and including LIMIT,
6020 that the callee needs to save. */
6021
6022 static unsigned
6023 aarch64_next_callee_save (unsigned regno, unsigned limit)
6024 {
6025 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6026 regno++;
6027 return regno;
6028 }
6029
6030 /* Push the register numbered REGNO of mode MODE to the stack with write-back
6031 adjusting the stack by ADJUSTMENT. */
6032
6033 static void
6034 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6035 HOST_WIDE_INT adjustment)
6036 {
6037 rtx base_rtx = stack_pointer_rtx;
6038 rtx insn, reg, mem;
6039
6040 reg = gen_rtx_REG (mode, regno);
6041 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6042 plus_constant (Pmode, base_rtx, -adjustment));
6043 mem = gen_frame_mem (mode, mem);
6044
6045 insn = emit_move_insn (mem, reg);
6046 RTX_FRAME_RELATED_P (insn) = 1;
6047 }
6048
6049 /* Generate and return an instruction to store the pair of registers
6050 REG and REG2 of mode MODE to location BASE, with write-back adjusting
6051 BASE by ADJUSTMENT. */
6052
6053 static rtx
6054 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6055 HOST_WIDE_INT adjustment)
6056 {
6057 switch (mode)
6058 {
6059 case E_DImode:
6060 return gen_storewb_pairdi_di (base, base, reg, reg2,
6061 GEN_INT (-adjustment),
6062 GEN_INT (UNITS_PER_WORD - adjustment));
6063 case E_DFmode:
6064 return gen_storewb_pairdf_di (base, base, reg, reg2,
6065 GEN_INT (-adjustment),
6066 GEN_INT (UNITS_PER_WORD - adjustment));
6067 case E_TFmode:
6068 return gen_storewb_pairtf_di (base, base, reg, reg2,
6069 GEN_INT (-adjustment),
6070 GEN_INT (UNITS_PER_VREG - adjustment));
6071 default:
6072 gcc_unreachable ();
6073 }
6074 }
6075
6076 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6077 stack pointer by ADJUSTMENT. */
6078
6079 static void
6080 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6081 {
6082 rtx_insn *insn;
6083 machine_mode mode = aarch64_reg_save_mode (regno1);
6084
6085 if (regno2 == INVALID_REGNUM)
6086 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6087
6088 rtx reg1 = gen_rtx_REG (mode, regno1);
6089 rtx reg2 = gen_rtx_REG (mode, regno2);
6090
6091 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6092 reg2, adjustment));
6093 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6094 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6095 RTX_FRAME_RELATED_P (insn) = 1;
6096 }
6097
6098 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
6099 adjusting it by ADJUSTMENT afterwards. */
6100
6101 static rtx
6102 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6103 HOST_WIDE_INT adjustment)
6104 {
6105 switch (mode)
6106 {
6107 case E_DImode:
6108 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6109 GEN_INT (UNITS_PER_WORD));
6110 case E_DFmode:
6111 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6112 GEN_INT (UNITS_PER_WORD));
6113 case E_TFmode:
6114 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6115 GEN_INT (UNITS_PER_VREG));
6116 default:
6117 gcc_unreachable ();
6118 }
6119 }
6120
6121 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6122 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6123 into CFI_OPS. */
6124
6125 static void
6126 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6127 rtx *cfi_ops)
6128 {
6129 machine_mode mode = aarch64_reg_save_mode (regno1);
6130 rtx reg1 = gen_rtx_REG (mode, regno1);
6131
6132 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6133
6134 if (regno2 == INVALID_REGNUM)
6135 {
6136 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6137 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6138 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6139 }
6140 else
6141 {
6142 rtx reg2 = gen_rtx_REG (mode, regno2);
6143 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6144 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6145 reg2, adjustment));
6146 }
6147 }
6148
6149 /* Generate and return a store pair instruction of mode MODE to store
6150 register REG1 to MEM1 and register REG2 to MEM2. */
6151
6152 static rtx
6153 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6154 rtx reg2)
6155 {
6156 switch (mode)
6157 {
6158 case E_DImode:
6159 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6160
6161 case E_DFmode:
6162 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6163
6164 case E_TFmode:
6165 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6166
6167 default:
6168 gcc_unreachable ();
6169 }
6170 }
6171
6172 /* Generate and return a load pair instruction of mode MODE to load register
6173 REG1 from MEM1 and register REG2 from MEM2. */
6174
6175 static rtx
6176 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6177 rtx mem2)
6178 {
6179 switch (mode)
6180 {
6181 case E_DImode:
6182 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6183
6184 case E_DFmode:
6185 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6186
6187 case E_TFmode:
6188 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6189
6190 default:
6191 gcc_unreachable ();
6192 }
6193 }
6194
6195 /* Return TRUE if return address signing should be enabled for the current
6196 function, otherwise return FALSE. */
6197
6198 bool
6199 aarch64_return_address_signing_enabled (void)
6200 {
6201 /* This function should only be called after the frame has been laid out. */
6202 gcc_assert (cfun->machine->frame.laid_out);
6203
6204 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6205 if its LR is pushed onto the stack. */
6206 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6207 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6208 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6209 }
6210
6211 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6212 bool
6213 aarch64_bti_enabled (void)
6214 {
6215 return (aarch64_enable_bti == 1);
6216 }
6217
6218 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6219 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6220 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6221
6222 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6223 or LD1D address
6224
6225 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
6226 if the variable isn't already nonnull
6227
6228 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6229 Handle this case using a temporary base register that is suitable for
6230 all offsets in that range. Use ANCHOR_REG as this base register if it
6231 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6232
6233 static inline void
6234 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6235 rtx &anchor_reg, poly_int64 &offset,
6236 rtx &ptrue)
6237 {
6238 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6239 {
6240 /* This is the maximum valid offset of the anchor from the base.
6241 Lower values would be valid too. */
6242 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6243 if (!anchor_reg)
6244 {
6245 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6246 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6247 gen_int_mode (anchor_offset, Pmode)));
6248 }
6249 base_rtx = anchor_reg;
6250 offset -= anchor_offset;
6251 }
6252 if (!ptrue)
6253 {
6254 int pred_reg = cfun->machine->frame.spare_pred_reg;
6255 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6256 CONSTM1_RTX (VNx16BImode));
6257 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6258 }
6259 }
6260
6261 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6262 is saved at BASE + OFFSET. */
6263
6264 static void
6265 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6266 rtx base, poly_int64 offset)
6267 {
6268 rtx mem = gen_frame_mem (GET_MODE (reg),
6269 plus_constant (Pmode, base, offset));
6270 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6271 }
6272
6273 /* Emit code to save the callee-saved registers from register number START
6274 to LIMIT to the stack at the location starting at offset START_OFFSET,
6275 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6276 is true if the hard frame pointer has been set up. */
6277
6278 static void
6279 aarch64_save_callee_saves (poly_int64 start_offset,
6280 unsigned start, unsigned limit, bool skip_wb,
6281 bool hard_fp_valid_p)
6282 {
6283 rtx_insn *insn;
6284 unsigned regno;
6285 unsigned regno2;
6286 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6287
6288 for (regno = aarch64_next_callee_save (start, limit);
6289 regno <= limit;
6290 regno = aarch64_next_callee_save (regno + 1, limit))
6291 {
6292 rtx reg, mem;
6293 poly_int64 offset;
6294 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6295
6296 if (skip_wb
6297 && (regno == cfun->machine->frame.wb_candidate1
6298 || regno == cfun->machine->frame.wb_candidate2))
6299 continue;
6300
6301 if (cfun->machine->reg_is_wrapped_separately[regno])
6302 continue;
6303
6304 machine_mode mode = aarch64_reg_save_mode (regno);
6305 reg = gen_rtx_REG (mode, regno);
6306 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6307 rtx base_rtx = stack_pointer_rtx;
6308 poly_int64 sp_offset = offset;
6309
6310 HOST_WIDE_INT const_offset;
6311 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6312 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6313 offset, ptrue);
6314 else if (GP_REGNUM_P (regno)
6315 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6316 {
6317 gcc_assert (known_eq (start_offset, 0));
6318 poly_int64 fp_offset
6319 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6320 if (hard_fp_valid_p)
6321 base_rtx = hard_frame_pointer_rtx;
6322 else
6323 {
6324 if (!anchor_reg)
6325 {
6326 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6327 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6328 gen_int_mode (fp_offset, Pmode)));
6329 }
6330 base_rtx = anchor_reg;
6331 }
6332 offset -= fp_offset;
6333 }
6334 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6335 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6336
6337 if (!aarch64_sve_mode_p (mode)
6338 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6339 && !cfun->machine->reg_is_wrapped_separately[regno2]
6340 && known_eq (GET_MODE_SIZE (mode),
6341 cfun->machine->frame.reg_offset[regno2]
6342 - cfun->machine->frame.reg_offset[regno]))
6343 {
6344 rtx reg2 = gen_rtx_REG (mode, regno2);
6345 rtx mem2;
6346
6347 offset += GET_MODE_SIZE (mode);
6348 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6349 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6350 reg2));
6351
6352 /* The first part of a frame-related parallel insn is
6353 always assumed to be relevant to the frame
6354 calculations; subsequent parts are only
6355 frame-related if explicitly marked. */
6356 if (aarch64_emit_cfi_for_reg_p (regno2))
6357 {
6358 if (need_cfa_note_p)
6359 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6360 sp_offset + GET_MODE_SIZE (mode));
6361 else
6362 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6363 }
6364
6365 regno = regno2;
6366 }
6367 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6368 {
6369 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6370 need_cfa_note_p = true;
6371 }
6372 else if (aarch64_sve_mode_p (mode))
6373 insn = emit_insn (gen_rtx_SET (mem, reg));
6374 else
6375 insn = emit_move_insn (mem, reg);
6376
6377 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6378 if (frame_related_p && need_cfa_note_p)
6379 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6380 }
6381 }
6382
6383 /* Emit code to restore the callee registers from register number START
6384 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6385 skipping any write-back candidates if SKIP_WB is true. Write the
6386 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6387
6388 static void
6389 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6390 unsigned limit, bool skip_wb, rtx *cfi_ops)
6391 {
6392 unsigned regno;
6393 unsigned regno2;
6394 poly_int64 offset;
6395 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6396
6397 for (regno = aarch64_next_callee_save (start, limit);
6398 regno <= limit;
6399 regno = aarch64_next_callee_save (regno + 1, limit))
6400 {
6401 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6402 if (cfun->machine->reg_is_wrapped_separately[regno])
6403 continue;
6404
6405 rtx reg, mem;
6406
6407 if (skip_wb
6408 && (regno == cfun->machine->frame.wb_candidate1
6409 || regno == cfun->machine->frame.wb_candidate2))
6410 continue;
6411
6412 machine_mode mode = aarch64_reg_save_mode (regno);
6413 reg = gen_rtx_REG (mode, regno);
6414 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6415 rtx base_rtx = stack_pointer_rtx;
6416 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6417 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6418 offset, ptrue);
6419 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6420
6421 if (!aarch64_sve_mode_p (mode)
6422 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6423 && !cfun->machine->reg_is_wrapped_separately[regno2]
6424 && known_eq (GET_MODE_SIZE (mode),
6425 cfun->machine->frame.reg_offset[regno2]
6426 - cfun->machine->frame.reg_offset[regno]))
6427 {
6428 rtx reg2 = gen_rtx_REG (mode, regno2);
6429 rtx mem2;
6430
6431 offset += GET_MODE_SIZE (mode);
6432 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6433 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6434
6435 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6436 regno = regno2;
6437 }
6438 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6439 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6440 else if (aarch64_sve_mode_p (mode))
6441 emit_insn (gen_rtx_SET (reg, mem));
6442 else
6443 emit_move_insn (reg, mem);
6444 if (frame_related_p)
6445 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6446 }
6447 }
6448
6449 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6450 of MODE. */
6451
6452 static inline bool
6453 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6454 {
6455 HOST_WIDE_INT multiple;
6456 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6457 && IN_RANGE (multiple, -8, 7));
6458 }
6459
6460 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6461 of MODE. */
6462
6463 static inline bool
6464 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6465 {
6466 HOST_WIDE_INT multiple;
6467 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6468 && IN_RANGE (multiple, 0, 63));
6469 }
6470
6471 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6472 of MODE. */
6473
6474 bool
6475 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6476 {
6477 HOST_WIDE_INT multiple;
6478 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6479 && IN_RANGE (multiple, -64, 63));
6480 }
6481
6482 /* Return true if OFFSET is a signed 9-bit value. */
6483
6484 bool
6485 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6486 poly_int64 offset)
6487 {
6488 HOST_WIDE_INT const_offset;
6489 return (offset.is_constant (&const_offset)
6490 && IN_RANGE (const_offset, -256, 255));
6491 }
6492
6493 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6494 of MODE. */
6495
6496 static inline bool
6497 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6498 {
6499 HOST_WIDE_INT multiple;
6500 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6501 && IN_RANGE (multiple, -256, 255));
6502 }
6503
6504 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6505 of MODE. */
6506
6507 static inline bool
6508 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6509 {
6510 HOST_WIDE_INT multiple;
6511 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6512 && IN_RANGE (multiple, 0, 4095));
6513 }
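/* As an example of what the scaled-offset predicates above accept (DImode
   has an 8-byte size): aarch64_offset_7bit_signed_scaled_p allows DImode
   offsets from -512 to +504 in steps of 8, matching the LDP/STP immediate
   range, while offset_12bit_unsigned_scaled_p allows 0 to 32760 in steps
   of 8, matching the unsigned scaled LDR/STR immediate range.  */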
6514
6515 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6516
6517 static sbitmap
6518 aarch64_get_separate_components (void)
6519 {
6520 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6521 bitmap_clear (components);
6522
6523 /* The registers that need to be saved to the frame. */
6524 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6525 if (aarch64_register_saved_on_entry (regno))
6526 {
6527 /* Punt on saves and restores that use ST1D and LD1D. We could
6528 try to be smarter, but it would involve making sure that the
6529 spare predicate register itself is safe to use at the save
6530 and restore points. Also, when a frame pointer is being used,
6531 the slots are often out of reach of ST1D and LD1D anyway. */
6532 machine_mode mode = aarch64_reg_save_mode (regno);
6533 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6534 continue;
6535
6536 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6537
6538 /* If the register is saved in the first SVE save slot, we use
6539 it as a stack probe for -fstack-clash-protection. */
6540 if (flag_stack_clash_protection
6541 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6542 && known_eq (offset, 0))
6543 continue;
6544
6545 /* Get the offset relative to the register we'll use. */
6546 if (frame_pointer_needed)
6547 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6548 else
6549 offset += crtl->outgoing_args_size;
6550
6551 /* Check that we can access the stack slot of the register with one
6552 direct load with no adjustments needed. */
6553 if (aarch64_sve_mode_p (mode)
6554 ? offset_9bit_signed_scaled_p (mode, offset)
6555 : offset_12bit_unsigned_scaled_p (mode, offset))
6556 bitmap_set_bit (components, regno);
6557 }
6558
6559 /* Don't mess with the hard frame pointer. */
6560 if (frame_pointer_needed)
6561 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6562
6563 /* If the spare predicate register used by big-endian SVE code
6564 is call-preserved, it must be saved in the main prologue
6565 before any saves that use it. */
6566 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6567 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6568
6569 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6570 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6571 /* If registers have been chosen to be stored/restored with
6572 writeback, don't interfere with them to avoid having to output explicit
6573 stack adjustment instructions. */
6574 if (reg2 != INVALID_REGNUM)
6575 bitmap_clear_bit (components, reg2);
6576 if (reg1 != INVALID_REGNUM)
6577 bitmap_clear_bit (components, reg1);
6578
6579 bitmap_clear_bit (components, LR_REGNUM);
6580 bitmap_clear_bit (components, SP_REGNUM);
6581
6582 return components;
6583 }
6584
6585 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6586
6587 static sbitmap
6588 aarch64_components_for_bb (basic_block bb)
6589 {
6590 bitmap in = DF_LIVE_IN (bb);
6591 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6592 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6593
6594 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6595 bitmap_clear (components);
6596
6597 /* Clobbered registers don't generate values in any meaningful sense,
6598 since nothing after the clobber can rely on their value. And we can't
6599 say that partially-clobbered registers are unconditionally killed,
6600 because whether they're killed or not depends on the mode of the
6601 value they're holding. Thus partially call-clobbered registers
6602 appear in neither the kill set nor the gen set.
6603
6604 Check manually for any calls that clobber more of a register than the
6605 current function can. */
6606 function_abi_aggregator callee_abis;
6607 rtx_insn *insn;
6608 FOR_BB_INSNS (bb, insn)
6609 if (CALL_P (insn))
6610 callee_abis.note_callee_abi (insn_callee_abi (insn));
6611 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6612
6613 /* A register is used in a bb if it is in the IN, GEN, or KILL sets. */
6614 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6615 if (!fixed_regs[regno]
6616 && !crtl->abi->clobbers_full_reg_p (regno)
6617 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6618 || bitmap_bit_p (in, regno)
6619 || bitmap_bit_p (gen, regno)
6620 || bitmap_bit_p (kill, regno)))
6621 {
6622 bitmap_set_bit (components, regno);
6623
6624 /* If there is a callee-save at an adjacent offset, add it too
6625 to increase the use of LDP/STP. */
6626 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6627 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6628
6629 if (regno2 <= LAST_SAVED_REGNUM)
6630 {
6631 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6632 if (regno < regno2
6633 ? known_eq (offset + 8, offset2)
6634 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6635 bitmap_set_bit (components, regno2);
6636 }
6637 }
6638
6639 return components;
6640 }
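/* For example (the register choice is illustrative): if x20 is live in
   this block and saved at offset 16 while x21 is saved at offset 24, the
   loop above also marks x21 as a component for the block, so that the
   prologue/epilogue code can save and restore the pair with a single
   STP/LDP.  */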
6641
6642 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6643 Nothing to do for aarch64. */
6644
6645 static void
6646 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6647 {
6648 }
6649
6650 /* Return the next set bit in BMP from START onwards. Return the total number
6651 of bits in BMP if no set bit is found at or after START. */
6652
6653 static unsigned int
6654 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6655 {
6656 unsigned int nbits = SBITMAP_SIZE (bmp);
6657 if (start == nbits)
6658 return start;
6659
6660 gcc_assert (start < nbits);
6661 for (unsigned int i = start; i < nbits; i++)
6662 if (bitmap_bit_p (bmp, i))
6663 return i;
6664
6665 return nbits;
6666 }
6667
6668 /* Do the work for aarch64_emit_prologue_components and
6669 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6670 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6671 for these components or the epilogue sequence. That is, it determines
6672 whether we should emit stores or loads and what kind of CFA notes to attach
6673 to the insns. Otherwise the logic for the two sequences is very
6674 similar. */
6675
6676 static void
6677 aarch64_process_components (sbitmap components, bool prologue_p)
6678 {
6679 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6680 ? HARD_FRAME_POINTER_REGNUM
6681 : STACK_POINTER_REGNUM);
6682
6683 unsigned last_regno = SBITMAP_SIZE (components);
6684 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6685 rtx_insn *insn = NULL;
6686
6687 while (regno != last_regno)
6688 {
6689 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6690 machine_mode mode = aarch64_reg_save_mode (regno);
6691
6692 rtx reg = gen_rtx_REG (mode, regno);
6693 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6694 if (frame_pointer_needed)
6695 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6696 else
6697 offset += crtl->outgoing_args_size;
6698
6699 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6700 rtx mem = gen_frame_mem (mode, addr);
6701
6702 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6703 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6704 /* No more registers to handle after REGNO.
6705 Emit a single save/restore and exit. */
6706 if (regno2 == last_regno)
6707 {
6708 insn = emit_insn (set);
6709 if (frame_related_p)
6710 {
6711 RTX_FRAME_RELATED_P (insn) = 1;
6712 if (prologue_p)
6713 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6714 else
6715 add_reg_note (insn, REG_CFA_RESTORE, reg);
6716 }
6717 break;
6718 }
6719
6720 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6721 /* The next register is not of the same class or its offset is not
6722 mergeable with the current one into a pair. */
6723 if (aarch64_sve_mode_p (mode)
6724 || !satisfies_constraint_Ump (mem)
6725 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6726 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6727 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6728 GET_MODE_SIZE (mode)))
6729 {
6730 insn = emit_insn (set);
6731 if (frame_related_p)
6732 {
6733 RTX_FRAME_RELATED_P (insn) = 1;
6734 if (prologue_p)
6735 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6736 else
6737 add_reg_note (insn, REG_CFA_RESTORE, reg);
6738 }
6739
6740 regno = regno2;
6741 continue;
6742 }
6743
6744 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6745
6746 /* REGNO2 can be saved/restored in a pair with REGNO. */
6747 rtx reg2 = gen_rtx_REG (mode, regno2);
6748 if (frame_pointer_needed)
6749 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6750 else
6751 offset2 += crtl->outgoing_args_size;
6752 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6753 rtx mem2 = gen_frame_mem (mode, addr2);
6754 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6755 : gen_rtx_SET (reg2, mem2);
6756
6757 if (prologue_p)
6758 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6759 else
6760 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6761
6762 if (frame_related_p || frame_related2_p)
6763 {
6764 RTX_FRAME_RELATED_P (insn) = 1;
6765 if (prologue_p)
6766 {
6767 if (frame_related_p)
6768 add_reg_note (insn, REG_CFA_OFFSET, set);
6769 if (frame_related2_p)
6770 add_reg_note (insn, REG_CFA_OFFSET, set2);
6771 }
6772 else
6773 {
6774 if (frame_related_p)
6775 add_reg_note (insn, REG_CFA_RESTORE, reg);
6776 if (frame_related2_p)
6777 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6778 }
6779 }
6780
6781 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6782 }
6783 }
6784
6785 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6786
6787 static void
6788 aarch64_emit_prologue_components (sbitmap components)
6789 {
6790 aarch64_process_components (components, true);
6791 }
6792
6793 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6794
6795 static void
6796 aarch64_emit_epilogue_components (sbitmap components)
6797 {
6798 aarch64_process_components (components, false);
6799 }
6800
6801 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6802
6803 static void
6804 aarch64_set_handled_components (sbitmap components)
6805 {
6806 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6807 if (bitmap_bit_p (components, regno))
6808 cfun->machine->reg_is_wrapped_separately[regno] = true;
6809 }
6810
6811 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6812 determine the probe offset for alloca. */
6813
6814 static HOST_WIDE_INT
6815 aarch64_stack_clash_protection_alloca_probe_range (void)
6816 {
6817 return STACK_CLASH_CALLER_GUARD;
6818 }
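/* STACK_CLASH_CALLER_GUARD is the ABI-defined caller-protected area
   referred to as the "1KB buffer" in the comment below; assuming the
   usual definition of 1024 bytes, alloca allocations that stay within
   that range need no extra probe.  */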
6819
6820
6821 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6822 registers. If POLY_SIZE is not large enough to require a probe this function
6823 will only adjust the stack. When allocating the stack space
6824 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6825 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6826 arguments. If we are, then we ensure that any allocation larger than the
6827 ABI-defined buffer needs a probe, so that the invariant of having a 1KB
6828 buffer is maintained.
6829
6830 We emit barriers after each stack adjustment to prevent optimizations from
6831 breaking the invariant that we never drop the stack more than a page. This
6832 invariant is needed to make it easier to correctly handle asynchronous
6833 events: e.g. if we were to allow the stack to be dropped by more than a page
6834 and then have multiple probes up, and we take a signal somewhere in between,
6835 then the signal handler doesn't know the state of the stack and can make no
6836 assumptions about which pages have been probed. */
6837
6838 static void
6839 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6840 poly_int64 poly_size,
6841 bool frame_related_p,
6842 bool final_adjustment_p)
6843 {
6844 HOST_WIDE_INT guard_size
6845 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6846 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6847 HOST_WIDE_INT min_probe_threshold
6848 = (final_adjustment_p
6849 ? guard_used_by_caller
6850 : guard_size - guard_used_by_caller);
6851 /* When doing the final adjustment for the outgoing arguments, take into
6852 account any unprobed space there is above the current SP. There are
6853 two cases:
6854
6855 - When saving SVE registers below the hard frame pointer, we force
6856 the lowest save to take place in the prologue before doing the final
6857 adjustment (i.e. we don't allow the save to be shrink-wrapped).
6858 This acts as a probe at SP, so there is no unprobed space.
6859
6860 - When there are no SVE register saves, we use the store of the link
6861 register as a probe. We can't assume that LR was saved at position 0
6862 though, so treat any space below it as unprobed. */
6863 if (final_adjustment_p
6864 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
6865 {
6866 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
6867 if (known_ge (lr_offset, 0))
6868 min_probe_threshold -= lr_offset.to_constant ();
6869 else
6870 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
6871 }
6872
6873 poly_int64 frame_size = cfun->machine->frame.frame_size;
6874
6875 /* We should always have a positive probe threshold. */
6876 gcc_assert (min_probe_threshold > 0);
6877
6878 if (flag_stack_clash_protection && !final_adjustment_p)
6879 {
6880 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6881 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
6882 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6883
6884 if (known_eq (frame_size, 0))
6885 {
6886 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6887 }
6888 else if (known_lt (initial_adjust + sve_callee_adjust,
6889 guard_size - guard_used_by_caller)
6890 && known_lt (final_adjust, guard_used_by_caller))
6891 {
6892 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6893 }
6894 }
6895
6896 /* If SIZE is not large enough to require probing, just adjust the stack and
6897 exit. */
6898 if (known_lt (poly_size, min_probe_threshold)
6899 || !flag_stack_clash_protection)
6900 {
6901 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6902 return;
6903 }
6904
6905 HOST_WIDE_INT size;
6906 /* Handle the SVE non-constant case first. */
6907 if (!poly_size.is_constant (&size))
6908 {
6909 if (dump_file)
6910 {
6911 fprintf (dump_file, "Stack clash SVE prologue: ");
6912 print_dec (poly_size, dump_file);
6913 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6914 }
6915
6916 /* First calculate the number of bytes we're actually spilling. */
6917 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6918 poly_size, temp1, temp2, false, true);
6919
6920 rtx_insn *insn = get_last_insn ();
6921
6922 if (frame_related_p)
6923 {
6924 /* This is done to provide unwinding information for the stack
6925 adjustments we're about to do; however, to prevent the optimizers
6926 from removing the R11 move and leaving the CFA note (which would be
6927 very wrong), we tie the old and new stack pointer together.
6928 The tie will expand to nothing but the optimizers will not touch
6929 the instruction. */
6930 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6931 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6932 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6933
6934 /* We want the CFA independent of the stack pointer for the
6935 duration of the loop. */
6936 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6937 RTX_FRAME_RELATED_P (insn) = 1;
6938 }
6939
6940 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6941 rtx guard_const = gen_int_mode (guard_size, Pmode);
6942
6943 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6944 stack_pointer_rtx, temp1,
6945 probe_const, guard_const));
6946
6947 /* Now reset the CFA register if needed. */
6948 if (frame_related_p)
6949 {
6950 add_reg_note (insn, REG_CFA_DEF_CFA,
6951 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6952 gen_int_mode (poly_size, Pmode)));
6953 RTX_FRAME_RELATED_P (insn) = 1;
6954 }
6955
6956 return;
6957 }
6958
6959 if (dump_file)
6960 fprintf (dump_file,
6961 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6962 " bytes, probing will be required.\n", size);
6963
6964 /* Round size to the nearest multiple of guard_size, and calculate the
6965 residual as the difference between the original size and the rounded
6966 size. */
6967 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6968 HOST_WIDE_INT residual = size - rounded_size;
6969
6970 /* We can handle a small number of allocations/probes inline. Otherwise
6971 punt to a loop. */
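/* Roughly speaking, each unrolled iteration below expands to something like
   sub sp, sp, #<guard size>
   str xzr, [sp, #<caller guard>]
   i.e. drop the stack by one guard-sized page and then touch a word about
   1KB above the new SP, inside the freshly allocated region.  */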
6972 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6973 {
6974 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6975 {
6976 aarch64_sub_sp (NULL, temp2, guard_size, true);
6977 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6978 guard_used_by_caller));
6979 emit_insn (gen_blockage ());
6980 }
6981 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6982 }
6983 else
6984 {
6985 /* Compute the ending address. */
6986 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6987 temp1, NULL, false, true);
6988 rtx_insn *insn = get_last_insn ();
6989
6990 /* For the initial allocation, we don't have a frame pointer
6991 set up, so we always need CFI notes. If we're doing the
6992 final allocation, then we may have a frame pointer, in which
6993 case it is the CFA, otherwise we need CFI notes.
6994
6995 We can determine which allocation we are doing by looking at
6996 the value of FRAME_RELATED_P since the final allocations are not
6997 frame related. */
6998 if (frame_related_p)
6999 {
7000 /* We want the CFA independent of the stack pointer for the
7001 duration of the loop. */
7002 add_reg_note (insn, REG_CFA_DEF_CFA,
7003 plus_constant (Pmode, temp1, rounded_size));
7004 RTX_FRAME_RELATED_P (insn) = 1;
7005 }
7006
7007 /* This allocates and probes the stack. Note that this re-uses some of
7008 the existing Ada stack protection code. However, we are guaranteed not
7009 to enter the non-loop or residual branches of that code.
7010
7011 The non-loop part won't be entered because if our allocation amount
7012 doesn't require a loop, the case above would handle it.
7013
7014 The residual amount won't be entered because TEMP1 is a multiple of
7015 the allocation size. The residual will always be 0. As such, the only
7016 part we are actually using from that code is the loop setup. The
7017 actual probing is done in aarch64_output_probe_stack_range. */
7018 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7019 stack_pointer_rtx, temp1));
7020
7021 /* Now reset the CFA register if needed. */
7022 if (frame_related_p)
7023 {
7024 add_reg_note (insn, REG_CFA_DEF_CFA,
7025 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7026 RTX_FRAME_RELATED_P (insn) = 1;
7027 }
7028
7029 emit_insn (gen_blockage ());
7030 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7031 }
7032
7033 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7034 be probed. This maintains the requirement that each page is probed at
7035 least once. For initial probing we probe only if the allocation is
7036 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7037 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7038 GUARD_SIZE. This means that any allocation large enough to trigger a
7039 probe here gets at least one, and any allocation too small for this code
7040 to emit anything for it will already have had its page probed by the
7041 saving of FP/LR, either in this function or in one of its callees. If
7042 we don't have any callees then we won't have more stack adjustments and so
7043 are still safe. */
7044 if (residual)
7045 {
7046 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7047 /* If we're doing final adjustments, and we've done any full page
7048 allocations then any residual needs to be probed. */
7049 if (final_adjustment_p && rounded_size != 0)
7050 min_probe_threshold = 0;
7051 /* If doing a small final adjustment, we always probe at offset 0.
7052 This is done to avoid issues when LR is not at position 0 or when
7053 the final adjustment is smaller than the probing offset. */
7054 else if (final_adjustment_p && rounded_size == 0)
7055 residual_probe_offset = 0;
7056
7057 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7058 if (residual >= min_probe_threshold)
7059 {
7060 if (dump_file)
7061 fprintf (dump_file,
7062 "Stack clash AArch64 prologue residuals: "
7063 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7064 "\n", residual);
7065
7066 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7067 residual_probe_offset));
7068 emit_insn (gen_blockage ());
7069 }
7070 }
7071 }
7072
7073 /* Return 1 if the register is used by the epilogue. We need to say the
7074 return register is used, but only after epilogue generation is complete.
7075 Note that in the case of sibcalls, the values "used by the epilogue" are
7076 considered live at the start of the called function.
7077
7078 For SIMD functions we need to return 1 for FP registers that are saved and
7079 restored by a function but are not zero in call_used_regs. If we do not
7080 do this, optimizations may remove the restore of the register. */
7081
7082 int
7083 aarch64_epilogue_uses (int regno)
7084 {
7085 if (epilogue_completed)
7086 {
7087 if (regno == LR_REGNUM)
7088 return 1;
7089 }
7090 return 0;
7091 }
7092
7093 /* AArch64 stack frames generated by this compiler look like:
7094
7095 +-------------------------------+
7096 | |
7097 | incoming stack arguments |
7098 | |
7099 +-------------------------------+
7100 | | <-- incoming stack pointer (aligned)
7101 | callee-allocated save area |
7102 | for register varargs |
7103 | |
7104 +-------------------------------+
7105 | local variables | <-- frame_pointer_rtx
7106 | |
7107 +-------------------------------+
7108 | padding | \
7109 +-------------------------------+ |
7110 | callee-saved registers | | frame.saved_regs_size
7111 +-------------------------------+ |
7112 | LR' | |
7113 +-------------------------------+ |
7114 | FP' | |
7115 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7116 | SVE vector registers | | \
7117 +-------------------------------+ | | below_hard_fp_saved_regs_size
7118 | SVE predicate registers | / /
7119 +-------------------------------+
7120 | dynamic allocation |
7121 +-------------------------------+
7122 | padding |
7123 +-------------------------------+
7124 | outgoing stack arguments | <-- arg_pointer
7125 | |
7126 +-------------------------------+
7127 | | <-- stack_pointer_rtx (aligned)
7128
7129 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7130 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7131 unchanged.
7132
7133 By default for stack-clash we assume the guard is at least 64KB, but this
7134 value is configurable to either 4KB or 64KB. We also force the guard size to
7135 be the same as the probing interval and both values are kept in sync.
7136
7137 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7138 on the guard size) of stack space without probing.
7139
7140 When probing is needed, we emit a probe at the start of the prologue
7141 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7142
7143 We have to track how much space has been allocated, and the only stores
7144 to the stack that we track as implicit probes are the FP/LR stores.
7145
7146 For outgoing arguments we probe if the size is larger than 1KB, such that
7147 the ABI specified buffer is maintained for the next callee.
7148
7149 The following registers are reserved during frame layout and should not be
7150 used for any other purpose:
7151
7152 - r11: Used by stack clash protection when SVE is enabled, and also
7153 as an anchor register when saving and restoring registers
7154 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7155 - r14 and r15: Used for speculation tracking.
7156 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7157 - r30(LR), r29(FP): Used by standard frame layout.
7158
7159 These registers must be avoided in frame layout related code unless the
7160 explicit intention is to interact with one of the features listed above. */
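/* For a typical small, non-SVE frame the prologue below boils down to
   something along the lines of
   stp x29, x30, [sp, #-<callee_adjust>]!
   mov x29, sp
   stp x19, x20, [sp, #16]
   sub sp, sp, #<final_adjust>
   i.e. push FP/LR while allocating the register save area, establish the
   frame chain, store the remaining callee saves and finally carve out the
   outgoing argument area, though the exact sequence depends on the frame
   layout computed for the function.  */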
7161
7162 /* Generate the prologue instructions for entry into a function.
7163 Establish the stack frame by decreasing the stack pointer with a
7164 properly calculated size and, if necessary, create a frame record
7165 filled with the values of LR and previous frame pointer. The
7166 current FP is also set up if it is in use. */
7167
7168 void
7169 aarch64_expand_prologue (void)
7170 {
7171 poly_int64 frame_size = cfun->machine->frame.frame_size;
7172 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7173 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7174 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7175 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7176 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7177 poly_int64 below_hard_fp_saved_regs_size
7178 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7179 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7180 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7181 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7182 rtx_insn *insn;
7183
7184 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7185 {
7186 /* Fold the SVE allocation into the initial allocation.
7187 We don't do this in aarch64_layout_frame to avoid pessimizing
7188 the epilogue code. */
7189 initial_adjust += sve_callee_adjust;
7190 sve_callee_adjust = 0;
7191 }
7192
7193 /* Sign return address for functions. */
7194 if (aarch64_return_address_signing_enabled ())
7195 {
7196 switch (aarch64_ra_sign_key)
7197 {
7198 case AARCH64_KEY_A:
7199 insn = emit_insn (gen_paciasp ());
7200 break;
7201 case AARCH64_KEY_B:
7202 insn = emit_insn (gen_pacibsp ());
7203 break;
7204 default:
7205 gcc_unreachable ();
7206 }
7207 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7208 RTX_FRAME_RELATED_P (insn) = 1;
7209 }
7210
7211 if (flag_stack_usage_info)
7212 current_function_static_stack_size = constant_lower_bound (frame_size);
7213
7214 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7215 {
7216 if (crtl->is_leaf && !cfun->calls_alloca)
7217 {
7218 if (maybe_gt (frame_size, PROBE_INTERVAL)
7219 && maybe_gt (frame_size, get_stack_check_protect ()))
7220 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7221 (frame_size
7222 - get_stack_check_protect ()));
7223 }
7224 else if (maybe_gt (frame_size, 0))
7225 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7226 }
7227
7228 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7229 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7230
7231 /* In theory we should never have both an initial adjustment
7232 and a callee save adjustment. Verify that is the case since the
7233 code below does not handle it for -fstack-clash-protection. */
7234 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7235
7236 /* Will only probe if the initial adjustment is larger than the guard
7237 less the amount of the guard reserved for use by the caller's
7238 outgoing args. */
7239 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7240 true, false);
7241
7242 if (callee_adjust != 0)
7243 aarch64_push_regs (reg1, reg2, callee_adjust);
7244
7245 /* The offset of the frame chain record (if any) from the current SP. */
7246 poly_int64 chain_offset = (initial_adjust + callee_adjust
7247 - cfun->machine->frame.hard_fp_offset);
7248 gcc_assert (known_ge (chain_offset, 0));
7249
7250 /* The offset of the bottom of the save area from the current SP. */
7251 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7252
7253 if (emit_frame_chain)
7254 {
7255 if (callee_adjust == 0)
7256 {
7257 reg1 = R29_REGNUM;
7258 reg2 = R30_REGNUM;
7259 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7260 false, false);
7261 }
7262 else
7263 gcc_assert (known_eq (chain_offset, 0));
7264 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7265 stack_pointer_rtx, chain_offset,
7266 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7267 if (frame_pointer_needed && !frame_size.is_constant ())
7268 {
7269 /* Variable-sized frames need to describe the save slot
7270 address using DW_CFA_expression rather than DW_CFA_offset.
7271 This means that, without taking further action, the
7272 locations of the registers that we've already saved would
7273 remain based on the stack pointer even after we redefine
7274 the CFA based on the frame pointer. We therefore need new
7275 DW_CFA_expressions to re-express the save slots with addresses
7276 based on the frame pointer. */
7277 rtx_insn *insn = get_last_insn ();
7278 gcc_assert (RTX_FRAME_RELATED_P (insn));
7279
7280 /* Add an explicit CFA definition if this was previously
7281 implicit. */
7282 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7283 {
7284 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7285 callee_offset);
7286 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7287 gen_rtx_SET (hard_frame_pointer_rtx, src));
7288 }
7289
7290 /* Change the save slot expressions for the registers that
7291 we've already saved. */
7292 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7293 hard_frame_pointer_rtx, UNITS_PER_WORD);
7294 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7295 hard_frame_pointer_rtx, 0);
7296 }
7297 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7298 }
7299
7300 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7301 callee_adjust != 0 || emit_frame_chain,
7302 emit_frame_chain);
7303 if (maybe_ne (sve_callee_adjust, 0))
7304 {
7305 gcc_assert (!flag_stack_clash_protection
7306 || known_eq (initial_adjust, 0));
7307 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7308 sve_callee_adjust,
7309 !frame_pointer_needed, false);
7310 saved_regs_offset += sve_callee_adjust;
7311 }
7312 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7313 false, emit_frame_chain);
7314 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7315 callee_adjust != 0 || emit_frame_chain,
7316 emit_frame_chain);
7317
7318 /* We may need to probe the final adjustment if it is larger than the guard
7319 that is assumed by the callee. */
7320 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7321 !frame_pointer_needed, true);
7322 }
7323
7324 /* Return TRUE if we can use a simple_return insn.
7325
7326 This function checks whether the callee-saved stack is empty, which
7327 means no restore actions are needed. The pro_and_epilogue pass will use
7328 this to check whether the shrink-wrapping optimization is feasible. */
7329
7330 bool
7331 aarch64_use_return_insn_p (void)
7332 {
7333 if (!reload_completed)
7334 return false;
7335
7336 if (crtl->profile)
7337 return false;
7338
7339 return known_eq (cfun->machine->frame.frame_size, 0);
7340 }
7341
7342 /* Generate the epilogue instructions for returning from a function.
7343 This is almost exactly the reverse of the prolog sequence, except
7344 that we need to insert barriers to avoid scheduling loads that read
7345 from a deallocated stack, and we optimize the unwind records by
7346 emitting them all together if possible. */
7347 void
7348 aarch64_expand_epilogue (bool for_sibcall)
7349 {
7350 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7351 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7352 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7353 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7354 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7355 poly_int64 below_hard_fp_saved_regs_size
7356 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7357 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7358 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7359 rtx cfi_ops = NULL;
7360 rtx_insn *insn;
7361 /* A stack clash protection prologue may not have left EP0_REGNUM or
7362 EP1_REGNUM in a usable state. The same is true for allocations
7363 with an SVE component, since we then need both temporary registers
7364 for each allocation. For stack clash we are in a usable state if
7365 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7366 HOST_WIDE_INT guard_size
7367 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
7368 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7369
7370 /* We can re-use the registers when:
7371
7372 (a) the deallocation amount is the same as the corresponding
7373 allocation amount (which is false if we combine the initial
7374 and SVE callee save allocations in the prologue); and
7375
7376 (b) the allocation amount doesn't need a probe (which is false
7377 if the amount is guard_size - guard_used_by_caller or greater).
7378
7379 In such situations the register should remain live with the correct
7380 value. */
7381 bool can_inherit_p = (initial_adjust.is_constant ()
7382 && final_adjust.is_constant ()
7383 && (!flag_stack_clash_protection
7384 || (known_lt (initial_adjust,
7385 guard_size - guard_used_by_caller)
7386 && known_eq (sve_callee_adjust, 0))));
7387
7388 /* We need to add memory barrier to prevent read from deallocated stack. */
7389 bool need_barrier_p
7390 = maybe_ne (get_frame_size ()
7391 + cfun->machine->frame.saved_varargs_size, 0);
7392
7393 /* Emit a barrier to prevent loads from a deallocated stack. */
7394 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7395 || cfun->calls_alloca
7396 || crtl->calls_eh_return)
7397 {
7398 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7399 need_barrier_p = false;
7400 }
7401
7402 /* Restore the stack pointer from the frame pointer if it may not
7403 be the same as the stack pointer. */
7404 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7405 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7406 if (frame_pointer_needed
7407 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7408 /* If writeback is used when restoring callee-saves, the CFA
7409 is restored on the instruction doing the writeback. */
7410 aarch64_add_offset (Pmode, stack_pointer_rtx,
7411 hard_frame_pointer_rtx,
7412 -callee_offset - below_hard_fp_saved_regs_size,
7413 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7414 else
7415 /* The case where we need to re-use the register here is very rare, so
7416 avoid the complicated condition and just always emit a move if the
7417 immediate doesn't fit. */
7418 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7419
7420 /* Restore the vector registers before the predicate registers,
7421 so that we can use P4 as a temporary for big-endian SVE frames. */
7422 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7423 callee_adjust != 0, &cfi_ops);
7424 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7425 false, &cfi_ops);
7426 if (maybe_ne (sve_callee_adjust, 0))
7427 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7428 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7429 R0_REGNUM, R30_REGNUM,
7430 callee_adjust != 0, &cfi_ops);
7431
7432 if (need_barrier_p)
7433 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7434
7435 if (callee_adjust != 0)
7436 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7437
7438 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7439 {
7440 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7441 insn = get_last_insn ();
7442 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7443 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7444 RTX_FRAME_RELATED_P (insn) = 1;
7445 cfi_ops = NULL;
7446 }
7447
7448 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7449 restrict the emit_move optimization to leaf functions. */
7450 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7451 (!can_inherit_p || !crtl->is_leaf
7452 || df_regs_ever_live_p (EP0_REGNUM)));
7453
7454 if (cfi_ops)
7455 {
7456 /* Emit delayed restores and reset the CFA to be SP. */
7457 insn = get_last_insn ();
7458 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7459 REG_NOTES (insn) = cfi_ops;
7460 RTX_FRAME_RELATED_P (insn) = 1;
7461 }
7462
7463 /* We prefer to emit the combined return/authenticate instruction RETAA;
7464 however, there are three cases in which we must instead emit an explicit
7465 authentication instruction.
7466
7467 1) Sibcalls don't return in a normal way, so if we're about to call one
7468 we must authenticate.
7469
7470 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7471 generating code for !TARGET_ARMV8_3 we can't use it and must
7472 explicitly authenticate.
7473
7474 3) On an eh_return path we make extra stack adjustments to update the
7475 canonical frame address to be the exception handler's CFA. We want
7476 to authenticate using the CFA of the function which calls eh_return.
7477 */
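/* Concretely, a signed-return-address epilogue therefore ends either in a
   single combined instruction such as
   retaa
   or, in the three cases above, in an explicit pair such as
   autiasp
   ret
   (with autibsp/retab being the corresponding key-B forms).  */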
7478 if (aarch64_return_address_signing_enabled ()
7479 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7480 {
7481 switch (aarch64_ra_sign_key)
7482 {
7483 case AARCH64_KEY_A:
7484 insn = emit_insn (gen_autiasp ());
7485 break;
7486 case AARCH64_KEY_B:
7487 insn = emit_insn (gen_autibsp ());
7488 break;
7489 default:
7490 gcc_unreachable ();
7491 }
7492 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7493 RTX_FRAME_RELATED_P (insn) = 1;
7494 }
7495
7496 /* Stack adjustment for exception handler. */
7497 if (crtl->calls_eh_return && !for_sibcall)
7498 {
7499 /* We need to unwind the stack by the offset computed by
7500 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7501 to be SP; letting the CFA move during this adjustment
7502 is just as correct as retaining the CFA from the body
7503 of the function. Therefore, do nothing special. */
7504 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7505 }
7506
7507 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7508 if (!for_sibcall)
7509 emit_jump_insn (ret_rtx);
7510 }
7511
7512 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7513 normally or return to a previous frame after unwinding.
7514
7515 An EH return uses a single shared return sequence. The epilogue is
7516 exactly like a normal epilogue except that it has an extra input
7517 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7518 that must be applied after the frame has been destroyed. An extra label
7519 is inserted before the epilogue which initializes this register to zero,
7520 and this is the entry point for a normal return.
7521
7522 An actual EH return updates the return address, initializes the stack
7523 adjustment and jumps directly into the epilogue (bypassing the zeroing
7524 of the adjustment). Since the return address is typically saved on the
7525 stack when a function makes a call, the saved LR must be updated outside
7526 the epilogue.
7527
7528 This poses problems as the store is generated well before the epilogue,
7529 so the offset of LR is not known yet. Also optimizations will remove the
7530 store as it appears dead, even after the epilogue is generated (as the
7531 base or offset for loading LR is different in many cases).
7532
7533 To avoid these problems this implementation forces the frame pointer
7534 in eh_return functions so that the location of LR is fixed and known early.
7535 It also marks the store volatile, so no optimization is permitted to
7536 remove the store. */
7537 rtx
7538 aarch64_eh_return_handler_rtx (void)
7539 {
7540 rtx tmp = gen_frame_mem (Pmode,
7541 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7542
7543 /* Mark the store volatile, so no optimization is permitted to remove it. */
7544 MEM_VOLATILE_P (tmp) = true;
7545 return tmp;
7546 }
7547
7548 /* Output code to add DELTA to the first argument, and then jump
7549 to FUNCTION. Used for C++ multiple inheritance. */
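/* For a thunk with a small DELTA and no vcall offset this amounts to
   something like
   add x0, x0, #<delta>
   b <function>
   i.e. adjust the incoming this pointer in x0 and tail-call the real
   method.  */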
7550 static void
7551 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7552 HOST_WIDE_INT delta,
7553 HOST_WIDE_INT vcall_offset,
7554 tree function)
7555 {
7556 /* The this pointer is always in x0. Note that this differs from
7557 Arm where the this pointer may be bumped to r1 if r0 is required
7558 to return a pointer to an aggregate. On AArch64 a result value
7559 pointer will be in x8. */
7560 int this_regno = R0_REGNUM;
7561 rtx this_rtx, temp0, temp1, addr, funexp;
7562 rtx_insn *insn;
7563 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7564
7565 if (aarch64_bti_enabled ())
7566 emit_insn (gen_bti_c());
7567
7568 reload_completed = 1;
7569 emit_note (NOTE_INSN_PROLOGUE_END);
7570
7571 this_rtx = gen_rtx_REG (Pmode, this_regno);
7572 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7573 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7574
7575 if (vcall_offset == 0)
7576 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7577 else
7578 {
7579 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7580
7581 addr = this_rtx;
7582 if (delta != 0)
7583 {
7584 if (delta >= -256 && delta < 256)
7585 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7586 plus_constant (Pmode, this_rtx, delta));
7587 else
7588 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7589 temp1, temp0, false);
7590 }
7591
7592 if (Pmode == ptr_mode)
7593 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7594 else
7595 aarch64_emit_move (temp0,
7596 gen_rtx_ZERO_EXTEND (Pmode,
7597 gen_rtx_MEM (ptr_mode, addr)));
7598
7599 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7600 addr = plus_constant (Pmode, temp0, vcall_offset);
7601 else
7602 {
7603 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7604 Pmode);
7605 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7606 }
7607
7608 if (Pmode == ptr_mode)
7609 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
7610 else
7611 aarch64_emit_move (temp1,
7612 gen_rtx_SIGN_EXTEND (Pmode,
7613 gen_rtx_MEM (ptr_mode, addr)));
7614
7615 emit_insn (gen_add2_insn (this_rtx, temp1));
7616 }
7617
7618 /* Generate a tail call to the target function. */
7619 if (!TREE_USED (function))
7620 {
7621 assemble_external (function);
7622 TREE_USED (function) = 1;
7623 }
7624 funexp = XEXP (DECL_RTL (function), 0);
7625 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7626 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7627 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7628 SIBLING_CALL_P (insn) = 1;
7629
7630 insn = get_insns ();
7631 shorten_branches (insn);
7632
7633 assemble_start_function (thunk, fnname);
7634 final_start_function (insn, file, 1);
7635 final (insn, file, 1);
7636 final_end_function ();
7637 assemble_end_function (thunk, fnname);
7638
7639 /* Stop pretending to be a post-reload pass. */
7640 reload_completed = 0;
7641 }
7642
7643 static bool
7644 aarch64_tls_referenced_p (rtx x)
7645 {
7646 if (!TARGET_HAVE_TLS)
7647 return false;
7648 subrtx_iterator::array_type array;
7649 FOR_EACH_SUBRTX (iter, array, x, ALL)
7650 {
7651 const_rtx x = *iter;
7652 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7653 return true;
7654 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7655 TLS offsets, not real symbol references. */
7656 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7657 iter.skip_subrtxes ();
7658 }
7659 return false;
7660 }
7661
7662
7663 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7664 a left shift of 0 or 12 bits. */
7665 bool
7666 aarch64_uimm12_shift (HOST_WIDE_INT val)
7667 {
7668 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7669 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7670 );
7671 }
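/* For example, 0xabc and 0xabc000 both satisfy this test (a 12-bit value
   shifted left by 0 or by 12), whereas 0xabc0 does not, because its set
   bits straddle the two possible fields.  */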
7672
7673 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7674 that can be created with a left shift of 0 or 12. */
7675 static HOST_WIDE_INT
7676 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7677 {
7678 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7679 handle correctly. */
7680 gcc_assert ((val & 0xffffff) == val);
7681
7682 if (((val & 0xfff) << 0) == val)
7683 return val;
7684
7685 return val & (0xfff << 12);
7686 }
7687
7688 /* Return true if val is an immediate that can be loaded into a
7689 register by a MOVZ instruction. */
7690 static bool
7691 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7692 {
7693 if (GET_MODE_SIZE (mode) > 4)
7694 {
7695 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7696 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7697 return 1;
7698 }
7699 else
7700 {
7701 /* Ignore sign extension. */
7702 val &= (HOST_WIDE_INT) 0xffffffff;
7703 }
7704 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7705 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7706 }
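/* For example, 0x12340000 satisfies this test (a MOVZ with a 16-bit shift),
   whereas 0x12345678 does not and would instead need a MOVZ/MOVK sequence
   or a literal load.  */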
7707
7708 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7709 64-bit (DImode) integer. */
7710
7711 static unsigned HOST_WIDE_INT
7712 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7713 {
7714 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7715 while (size < 64)
7716 {
7717 val &= (HOST_WIDE_INT_1U << size) - 1;
7718 val |= val << size;
7719 size *= 2;
7720 }
7721 return val;
7722 }
7723
7724 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7725
7726 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7727 {
7728 0x0000000100000001ull,
7729 0x0001000100010001ull,
7730 0x0101010101010101ull,
7731 0x1111111111111111ull,
7732 0x5555555555555555ull,
7733 };
7734
7735
7736 /* Return true if val is a valid bitmask immediate. */
7737
7738 bool
7739 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7740 {
7741 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7742 int bits;
7743
7744 /* Check for a single sequence of one bits and return quickly if so.
7745 The special cases of all ones and all zeroes return false. */
7746 val = aarch64_replicate_bitmask_imm (val_in, mode);
7747 tmp = val + (val & -val);
7748
7749 if (tmp == (tmp & -tmp))
7750 return (val + 1) > 1;
7751
7752 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7753 if (mode == SImode)
7754 val = (val << 32) | (val & 0xffffffff);
7755
7756 /* Invert if the immediate doesn't start with a zero bit - this means we
7757 only need to search for sequences of one bits. */
7758 if (val & 1)
7759 val = ~val;
7760
7761 /* Find the first set bit and set tmp to val with the first sequence of one
7762 bits removed. Return success if there is a single sequence of ones. */
7763 first_one = val & -val;
7764 tmp = val & (val + first_one);
7765
7766 if (tmp == 0)
7767 return true;
7768
7769 /* Find the next set bit and compute the difference in bit position. */
7770 next_one = tmp & -tmp;
7771 bits = clz_hwi (first_one) - clz_hwi (next_one);
7772 mask = val ^ tmp;
7773
7774 /* Check the bit position difference is a power of 2, and that the first
7775 sequence of one bits fits within 'bits' bits. */
7776 if ((mask >> bits) != 0 || bits != (bits & -bits))
7777 return false;
7778
7779 /* Check the sequence of one bits is repeated 64/bits times. */
7780 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7781 }
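/* For example, 0x0f0f0f0f0f0f0f0f (a run of four ones repeating every eight
   bits) is a valid bitmask immediate, whereas 0xfff1 is not, because its
   set bits do not form a single replicated run.  */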
7782
7783 /* Create a mask of ones covering the range from the lowest to the highest
7784 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7785
7786 unsigned HOST_WIDE_INT
7787 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7788 {
7789 int lowest_bit_set = ctz_hwi (val_in);
7790 int highest_bit_set = floor_log2 (val_in);
7791 gcc_assert (val_in != 0);
7792
7793 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7794 (HOST_WIDE_INT_1U << lowest_bit_set));
7795 }
7796
7797 /* Create a constant in which all bits outside the range from the lowest
7798 set bit to the highest set bit of VAL_IN are set to 1. */
7799
7800 unsigned HOST_WIDE_INT
7801 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7802 {
7803 return val_in | ~aarch64_and_split_imm1 (val_in);
7804 }
7805
7806 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7807
7808 bool
7809 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7810 {
7811 scalar_int_mode int_mode;
7812 if (!is_a <scalar_int_mode> (mode, &int_mode))
7813 return false;
7814
7815 if (aarch64_bitmask_imm (val_in, int_mode))
7816 return false;
7817
7818 if (aarch64_move_imm (val_in, int_mode))
7819 return false;
7820
7821 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7822
7823 return aarch64_bitmask_imm (imm2, int_mode);
7824 }
7825
7826 /* Return true if val is an immediate that can be loaded into a
7827 register in a single instruction. */
7828 bool
7829 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7830 {
7831 scalar_int_mode int_mode;
7832 if (!is_a <scalar_int_mode> (mode, &int_mode))
7833 return false;
7834
7835 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7836 return 1;
7837 return aarch64_bitmask_imm (val, int_mode);
7838 }
7839
7840 static bool
7841 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7842 {
7843 rtx base, offset;
7844
7845 if (GET_CODE (x) == HIGH)
7846 return true;
7847
7848 /* There's no way to calculate VL-based values using relocations. */
7849 subrtx_iterator::array_type array;
7850 FOR_EACH_SUBRTX (iter, array, x, ALL)
7851 if (GET_CODE (*iter) == CONST_POLY_INT)
7852 return true;
7853
7854 split_const (x, &base, &offset);
7855 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7856 {
7857 if (aarch64_classify_symbol (base, INTVAL (offset))
7858 != SYMBOL_FORCE_TO_MEM)
7859 return true;
7860 else
7861 /* Avoid generating a 64-bit relocation in ILP32; leave
7862 to aarch64_expand_mov_immediate to handle it properly. */
7863 return mode != ptr_mode;
7864 }
7865
7866 return aarch64_tls_referenced_p (x);
7867 }
7868
7869 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7870 The expansion for a table switch is quite expensive due to the number
7871 of instructions, the table lookup and the hard-to-predict indirect jump.
7872 When optimizing for speed with -O3 enabled, use the per-core tuning if
7873 set, otherwise use tables for > 16 cases as a tradeoff between size and
7874 performance. When optimizing for size, use the default setting. */
7875
7876 static unsigned int
7877 aarch64_case_values_threshold (void)
7878 {
7879 /* Use the specified limit for the number of cases before using jump
7880 tables at higher optimization levels. */
7881 if (optimize > 2
7882 && selected_cpu->tune->max_case_values != 0)
7883 return selected_cpu->tune->max_case_values;
7884 else
7885 return optimize_size ? default_case_values_threshold () : 17;
7886 }
7887
7888 /* Return true if register REGNO is a valid index register.
7889 STRICT_P is true if REG_OK_STRICT is in effect. */
7890
7891 bool
7892 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7893 {
7894 if (!HARD_REGISTER_NUM_P (regno))
7895 {
7896 if (!strict_p)
7897 return true;
7898
7899 if (!reg_renumber)
7900 return false;
7901
7902 regno = reg_renumber[regno];
7903 }
7904 return GP_REGNUM_P (regno);
7905 }
7906
7907 /* Return true if register REGNO is a valid base register for mode MODE.
7908 STRICT_P is true if REG_OK_STRICT is in effect. */
7909
7910 bool
7911 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7912 {
7913 if (!HARD_REGISTER_NUM_P (regno))
7914 {
7915 if (!strict_p)
7916 return true;
7917
7918 if (!reg_renumber)
7919 return false;
7920
7921 regno = reg_renumber[regno];
7922 }
7923
7924 /* The fake registers will be eliminated to either the stack or
7925 hard frame pointer, both of which are usually valid base registers.
7926 Reload deals with the cases where the eliminated form isn't valid. */
7927 return (GP_REGNUM_P (regno)
7928 || regno == SP_REGNUM
7929 || regno == FRAME_POINTER_REGNUM
7930 || regno == ARG_POINTER_REGNUM);
7931 }
7932
7933 /* Return true if X is a valid base register for mode MODE.
7934 STRICT_P is true if REG_OK_STRICT is in effect. */
7935
7936 static bool
7937 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7938 {
7939 if (!strict_p
7940 && GET_CODE (x) == SUBREG
7941 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7942 x = SUBREG_REG (x);
7943
7944 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7945 }
7946
7947 /* Return true if address offset is a valid index. If it is, fill in INFO
7948 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
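/* The RTL shapes recognized below correspond to index operands such as
   [x0, x1]           base plus register
   [x0, x1, lsl #3]   base plus scaled register
   [x0, w1, sxtw]     base plus sign-extended 32-bit register
   [x0, w1, uxtw #2]  base plus zero-extended, scaled 32-bit register
   in AArch64 assembly, give or take the exact extend/shift combination.  */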
7949
7950 static bool
7951 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7952 machine_mode mode, bool strict_p)
7953 {
7954 enum aarch64_address_type type;
7955 rtx index;
7956 int shift;
7957
7958 /* (reg:P) */
7959 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7960 && GET_MODE (x) == Pmode)
7961 {
7962 type = ADDRESS_REG_REG;
7963 index = x;
7964 shift = 0;
7965 }
7966 /* (sign_extend:DI (reg:SI)) */
7967 else if ((GET_CODE (x) == SIGN_EXTEND
7968 || GET_CODE (x) == ZERO_EXTEND)
7969 && GET_MODE (x) == DImode
7970 && GET_MODE (XEXP (x, 0)) == SImode)
7971 {
7972 type = (GET_CODE (x) == SIGN_EXTEND)
7973 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7974 index = XEXP (x, 0);
7975 shift = 0;
7976 }
7977 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7978 else if (GET_CODE (x) == MULT
7979 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7980 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7981 && GET_MODE (XEXP (x, 0)) == DImode
7982 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7983 && CONST_INT_P (XEXP (x, 1)))
7984 {
7985 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7986 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7987 index = XEXP (XEXP (x, 0), 0);
7988 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7989 }
7990 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7991 else if (GET_CODE (x) == ASHIFT
7992 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7993 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7994 && GET_MODE (XEXP (x, 0)) == DImode
7995 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7996 && CONST_INT_P (XEXP (x, 1)))
7997 {
7998 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7999 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8000 index = XEXP (XEXP (x, 0), 0);
8001 shift = INTVAL (XEXP (x, 1));
8002 }
8003 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8004 else if ((GET_CODE (x) == SIGN_EXTRACT
8005 || GET_CODE (x) == ZERO_EXTRACT)
8006 && GET_MODE (x) == DImode
8007 && GET_CODE (XEXP (x, 0)) == MULT
8008 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8009 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8010 {
8011 type = (GET_CODE (x) == SIGN_EXTRACT)
8012 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8013 index = XEXP (XEXP (x, 0), 0);
8014 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8015 if (INTVAL (XEXP (x, 1)) != 32 + shift
8016 || INTVAL (XEXP (x, 2)) != 0)
8017 shift = -1;
8018 }
8019 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8020 (const_int 0xffffffff<<shift)) */
8021 else if (GET_CODE (x) == AND
8022 && GET_MODE (x) == DImode
8023 && GET_CODE (XEXP (x, 0)) == MULT
8024 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8025 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8026 && CONST_INT_P (XEXP (x, 1)))
8027 {
8028 type = ADDRESS_REG_UXTW;
8029 index = XEXP (XEXP (x, 0), 0);
8030 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8031 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8032 shift = -1;
8033 }
8034 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8035 else if ((GET_CODE (x) == SIGN_EXTRACT
8036 || GET_CODE (x) == ZERO_EXTRACT)
8037 && GET_MODE (x) == DImode
8038 && GET_CODE (XEXP (x, 0)) == ASHIFT
8039 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8040 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8041 {
8042 type = (GET_CODE (x) == SIGN_EXTRACT)
8043 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8044 index = XEXP (XEXP (x, 0), 0);
8045 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8046 if (INTVAL (XEXP (x, 1)) != 32 + shift
8047 || INTVAL (XEXP (x, 2)) != 0)
8048 shift = -1;
8049 }
8050 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8051 (const_int 0xffffffff<<shift)) */
8052 else if (GET_CODE (x) == AND
8053 && GET_MODE (x) == DImode
8054 && GET_CODE (XEXP (x, 0)) == ASHIFT
8055 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8056 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8057 && CONST_INT_P (XEXP (x, 1)))
8058 {
8059 type = ADDRESS_REG_UXTW;
8060 index = XEXP (XEXP (x, 0), 0);
8061 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8062 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8063 shift = -1;
8064 }
8065 /* (mult:P (reg:P) (const_int scale)) */
8066 else if (GET_CODE (x) == MULT
8067 && GET_MODE (x) == Pmode
8068 && GET_MODE (XEXP (x, 0)) == Pmode
8069 && CONST_INT_P (XEXP (x, 1)))
8070 {
8071 type = ADDRESS_REG_REG;
8072 index = XEXP (x, 0);
8073 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8074 }
8075 /* (ashift:P (reg:P) (const_int shift)) */
8076 else if (GET_CODE (x) == ASHIFT
8077 && GET_MODE (x) == Pmode
8078 && GET_MODE (XEXP (x, 0)) == Pmode
8079 && CONST_INT_P (XEXP (x, 1)))
8080 {
8081 type = ADDRESS_REG_REG;
8082 index = XEXP (x, 0);
8083 shift = INTVAL (XEXP (x, 1));
8084 }
8085 else
8086 return false;
8087
8088 if (!strict_p
8089 && GET_CODE (index) == SUBREG
8090 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8091 index = SUBREG_REG (index);
8092
8093 if (aarch64_sve_data_mode_p (mode))
8094 {
8095 if (type != ADDRESS_REG_REG
8096 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8097 return false;
8098 }
8099 else
8100 {
8101 if (shift != 0
8102 && !(IN_RANGE (shift, 1, 3)
8103 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8104 return false;
8105 }
8106
8107 if (REG_P (index)
8108 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8109 {
8110 info->type = type;
8111 info->offset = index;
8112 info->shift = shift;
8113 return true;
8114 }
8115
8116 return false;
8117 }
8118
8119 /* Return true if MODE is one of the modes for which we
8120 support LDP/STP operations. */
8121
8122 static bool
8123 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8124 {
8125 return mode == SImode || mode == DImode
8126 || mode == SFmode || mode == DFmode
8127 || (aarch64_vector_mode_supported_p (mode)
8128 && (known_eq (GET_MODE_SIZE (mode), 8)
8129 || (known_eq (GET_MODE_SIZE (mode), 16)
8130 && (aarch64_tune_params.extra_tuning_flags
8131 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8132 }
8133
8134 /* Return true if REGNO is a virtual pointer register, or an eliminable
8135 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8136 include stack_pointer or hard_frame_pointer. */
8137 static bool
8138 virt_or_elim_regno_p (unsigned regno)
8139 {
8140 return ((regno >= FIRST_VIRTUAL_REGISTER
8141 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8142 || regno == FRAME_POINTER_REGNUM
8143 || regno == ARG_POINTER_REGNUM);
8144 }
8145
8146 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8147 If it is, fill in INFO appropriately. STRICT_P is true if
8148 REG_OK_STRICT is in effect. */
8149
8150 bool
8151 aarch64_classify_address (struct aarch64_address_info *info,
8152 rtx x, machine_mode mode, bool strict_p,
8153 aarch64_addr_query_type type)
8154 {
8155 enum rtx_code code = GET_CODE (x);
8156 rtx op0, op1;
8157 poly_int64 offset;
8158
8159 HOST_WIDE_INT const_size;
8160
8161 /* Whether a vector mode is partial doesn't affect address legitimacy.
8162 Partial vectors like VNx8QImode allow the same indexed addressing
8163 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8164 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8165 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8166 vec_flags &= ~VEC_PARTIAL;
8167
8168 /* On BE, we use load/store pair for all large int mode load/stores.
8169 TI/TFmode may also use a load/store pair. */
8170 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8171 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8172 || type == ADDR_QUERY_LDP_STP_N
8173 || mode == TImode
8174 || mode == TFmode
8175 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8176
8177 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8178 corresponds to the actual size of the memory being loaded/stored and the
8179 mode used for the corresponding addressing is half of that. */
8180 if (type == ADDR_QUERY_LDP_STP_N
8181 && known_eq (GET_MODE_SIZE (mode), 16))
8182 mode = DFmode;
8183
8184 bool allow_reg_index_p = (!load_store_pair_p
8185 && (known_lt (GET_MODE_SIZE (mode), 16)
8186 || vec_flags == VEC_ADVSIMD
8187 || vec_flags & VEC_SVE_DATA));
8188
8189 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8190 [Rn, #offset, MUL VL]. */
8191 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8192 && (code != REG && code != PLUS))
8193 return false;
8194
8195 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8196 REG addressing. */
8197 if (advsimd_struct_p
8198 && !BYTES_BIG_ENDIAN
8199 && (code != POST_INC && code != REG))
8200 return false;
8201
8202 gcc_checking_assert (GET_MODE (x) == VOIDmode
8203 || SCALAR_INT_MODE_P (GET_MODE (x)));
8204
8205 switch (code)
8206 {
8207 case REG:
8208 case SUBREG:
8209 info->type = ADDRESS_REG_IMM;
8210 info->base = x;
8211 info->offset = const0_rtx;
8212 info->const_offset = 0;
8213 return aarch64_base_register_rtx_p (x, strict_p);
8214
8215 case PLUS:
8216 op0 = XEXP (x, 0);
8217 op1 = XEXP (x, 1);
8218
8219 if (! strict_p
8220 && REG_P (op0)
8221 && virt_or_elim_regno_p (REGNO (op0))
8222 && poly_int_rtx_p (op1, &offset))
8223 {
8224 info->type = ADDRESS_REG_IMM;
8225 info->base = op0;
8226 info->offset = op1;
8227 info->const_offset = offset;
8228
8229 return true;
8230 }
8231
8232 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8233 && aarch64_base_register_rtx_p (op0, strict_p)
8234 && poly_int_rtx_p (op1, &offset))
8235 {
8236 info->type = ADDRESS_REG_IMM;
8237 info->base = op0;
8238 info->offset = op1;
8239 info->const_offset = offset;
8240
8241 /* TImode and TFmode values are allowed in both pairs of X
8242 registers and individual Q registers. The available
8243 address modes are:
8244 X,X: 7-bit signed scaled offset
8245 Q: 9-bit signed offset
8246 We conservatively require an offset representable in either mode.
8247 When performing the check for pairs of X registers i.e. LDP/STP
8248 pass down DImode since that is the natural size of the LDP/STP
8249 instruction memory accesses. */
8250 if (mode == TImode || mode == TFmode)
8251 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8252 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8253 || offset_12bit_unsigned_scaled_p (mode, offset)));
8254
8255 /* A 7bit offset check because OImode will emit a ldp/stp
8256 instruction (only big endian will get here).
8257 For ldp/stp instructions, the offset is scaled for the size of a
8258 single element of the pair. */
8259 if (mode == OImode)
8260 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8261
8262 /* Three 9/12 bit offsets checks because CImode will emit three
8263 ldr/str instructions (only big endian will get here). */
8264 if (mode == CImode)
8265 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8266 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8267 offset + 32)
8268 || offset_12bit_unsigned_scaled_p (V16QImode,
8269 offset + 32)));
8270
8271 /* Two 7bit offsets checks because XImode will emit two ldp/stp
8272 instructions (only big endian will get here). */
8273 if (mode == XImode)
8274 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8275 && aarch64_offset_7bit_signed_scaled_p (TImode,
8276 offset + 32));
8277
8278 /* Make "m" use the LD1 offset range for SVE data modes, so
8279 that pre-RTL optimizers like ivopts will work to that
8280 instead of the wider LDR/STR range. */
8281 if (vec_flags == VEC_SVE_DATA)
8282 return (type == ADDR_QUERY_M
8283 ? offset_4bit_signed_scaled_p (mode, offset)
8284 : offset_9bit_signed_scaled_p (mode, offset));
8285
8286 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8287 {
8288 poly_int64 end_offset = (offset
8289 + GET_MODE_SIZE (mode)
8290 - BYTES_PER_SVE_VECTOR);
8291 return (type == ADDR_QUERY_M
8292 ? offset_4bit_signed_scaled_p (mode, offset)
8293 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8294 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8295 end_offset)));
8296 }
8297
8298 if (vec_flags == VEC_SVE_PRED)
8299 return offset_9bit_signed_scaled_p (mode, offset);
8300
8301 if (load_store_pair_p)
8302 return ((known_eq (GET_MODE_SIZE (mode), 4)
8303 || known_eq (GET_MODE_SIZE (mode), 8)
8304 || known_eq (GET_MODE_SIZE (mode), 16))
8305 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8306 else
8307 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8308 || offset_12bit_unsigned_scaled_p (mode, offset));
8309 }
8310
8311 if (allow_reg_index_p)
8312 {
8313 /* Look for base + (scaled/extended) index register. */
8314 if (aarch64_base_register_rtx_p (op0, strict_p)
8315 && aarch64_classify_index (info, op1, mode, strict_p))
8316 {
8317 info->base = op0;
8318 return true;
8319 }
8320 if (aarch64_base_register_rtx_p (op1, strict_p)
8321 && aarch64_classify_index (info, op0, mode, strict_p))
8322 {
8323 info->base = op1;
8324 return true;
8325 }
8326 }
8327
8328 return false;
8329
8330 case POST_INC:
8331 case POST_DEC:
8332 case PRE_INC:
8333 case PRE_DEC:
8334 info->type = ADDRESS_REG_WB;
8335 info->base = XEXP (x, 0);
8336 info->offset = NULL_RTX;
8337 return aarch64_base_register_rtx_p (info->base, strict_p);
8338
8339 case POST_MODIFY:
8340 case PRE_MODIFY:
8341 info->type = ADDRESS_REG_WB;
8342 info->base = XEXP (x, 0);
8343 if (GET_CODE (XEXP (x, 1)) == PLUS
8344 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8345 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8346 && aarch64_base_register_rtx_p (info->base, strict_p))
8347 {
8348 info->offset = XEXP (XEXP (x, 1), 1);
8349 info->const_offset = offset;
8350
8351 /* TImode and TFmode values are allowed in both pairs of X
8352 registers and individual Q registers. The available
8353 address modes are:
8354 X,X: 7-bit signed scaled offset
8355 Q: 9-bit signed offset
8356 We conservatively require an offset representable in either mode.
8357 */
8358 if (mode == TImode || mode == TFmode)
8359 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8360 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8361
8362 if (load_store_pair_p)
8363 return ((known_eq (GET_MODE_SIZE (mode), 4)
8364 || known_eq (GET_MODE_SIZE (mode), 8)
8365 || known_eq (GET_MODE_SIZE (mode), 16))
8366 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8367 else
8368 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8369 }
8370 return false;
8371
8372 case CONST:
8373 case SYMBOL_REF:
8374 case LABEL_REF:
8375 /* load literal: pc-relative constant pool entry. Only supported
8376 for SI mode or larger. */
8377 info->type = ADDRESS_SYMBOLIC;
8378
8379 if (!load_store_pair_p
8380 && GET_MODE_SIZE (mode).is_constant (&const_size)
8381 && const_size >= 4)
8382 {
8383 rtx sym, addend;
8384
8385 split_const (x, &sym, &addend);
8386 return ((GET_CODE (sym) == LABEL_REF
8387 || (GET_CODE (sym) == SYMBOL_REF
8388 && CONSTANT_POOL_ADDRESS_P (sym)
8389 && aarch64_pcrelative_literal_loads)));
8390 }
8391 return false;
8392
8393 case LO_SUM:
8394 info->type = ADDRESS_LO_SUM;
8395 info->base = XEXP (x, 0);
8396 info->offset = XEXP (x, 1);
8397 if (allow_reg_index_p
8398 && aarch64_base_register_rtx_p (info->base, strict_p))
8399 {
8400 rtx sym, offs;
8401 split_const (info->offset, &sym, &offs);
8402 if (GET_CODE (sym) == SYMBOL_REF
8403 && (aarch64_classify_symbol (sym, INTVAL (offs))
8404 == SYMBOL_SMALL_ABSOLUTE))
8405 {
8406 /* The symbol and offset must be aligned to the access size. */
8407 unsigned int align;
8408
8409 if (CONSTANT_POOL_ADDRESS_P (sym))
8410 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8411 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8412 {
8413 tree exp = SYMBOL_REF_DECL (sym);
8414 align = TYPE_ALIGN (TREE_TYPE (exp));
8415 align = aarch64_constant_alignment (exp, align);
8416 }
8417 else if (SYMBOL_REF_DECL (sym))
8418 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8419 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8420 && SYMBOL_REF_BLOCK (sym) != NULL)
8421 align = SYMBOL_REF_BLOCK (sym)->alignment;
8422 else
8423 align = BITS_PER_UNIT;
8424
8425 poly_int64 ref_size = GET_MODE_SIZE (mode);
8426 if (known_eq (ref_size, 0))
8427 ref_size = GET_MODE_SIZE (DImode);
8428
8429 return (multiple_p (INTVAL (offs), ref_size)
8430 && multiple_p (align / BITS_PER_UNIT, ref_size));
8431 }
8432 }
8433 return false;
8434
8435 default:
8436 return false;
8437 }
8438 }
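
/* Illustrative sketch, not part of the port: the TImode/TFmode
   writeback-offset rule described above, restated in plain integer
   arithmetic.  The helper name and the standalone form are assumptions
   made for this example; the real checks are
   aarch64_offset_7bit_signed_scaled_p and
   aarch64_offset_9bit_signed_unscaled_p.  */
static int
example_ti_tf_wb_offset_ok (long long offset)
{
  /* 7-bit signed scaled (pair of X registers): the offset must be a
     multiple of 16 and the scaled value must lie in [-64, 63].  */
  int scaled_ok = (offset % 16 == 0
                   && offset / 16 >= -64
                   && offset / 16 <= 63);

  /* 9-bit signed unscaled (single Q register): [-256, 255].  */
  int unscaled_ok = (offset >= -256 && offset <= 255);

  /* The code above conservatively requires both forms to be valid.  */
  return scaled_ok && unscaled_ok;
}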
8439
8440 /* Return true if the address X is valid for a PRFM instruction.
8441 STRICT_P is true if we should do strict checking with
8442 aarch64_classify_address. */
8443
8444 bool
8445 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8446 {
8447 struct aarch64_address_info addr;
8448
8449 /* PRFM accepts the same addresses as DImode... */
8450 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8451 if (!res)
8452 return false;
8453
8454 /* ... except writeback forms. */
8455 return addr.type != ADDRESS_REG_WB;
8456 }
8457
8458 bool
8459 aarch64_symbolic_address_p (rtx x)
8460 {
8461 rtx offset;
8462
8463 split_const (x, &x, &offset);
8464 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8465 }
8466
8467 /* Classify the base of symbolic expression X. */
8468
8469 enum aarch64_symbol_type
8470 aarch64_classify_symbolic_expression (rtx x)
8471 {
8472 rtx offset;
8473
8474 split_const (x, &x, &offset);
8475 return aarch64_classify_symbol (x, INTVAL (offset));
8476 }
8477
8478
8479 /* Return TRUE if X is a legitimate address for accessing memory in
8480 mode MODE. */
8481 static bool
8482 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8483 {
8484 struct aarch64_address_info addr;
8485
8486 return aarch64_classify_address (&addr, x, mode, strict_p);
8487 }
8488
8489 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8490 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8491 bool
8492 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8493 aarch64_addr_query_type type)
8494 {
8495 struct aarch64_address_info addr;
8496
8497 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8498 }
8499
8500 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8501
8502 static bool
8503 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8504 poly_int64 orig_offset,
8505 machine_mode mode)
8506 {
8507 HOST_WIDE_INT size;
8508 if (GET_MODE_SIZE (mode).is_constant (&size))
8509 {
8510 HOST_WIDE_INT const_offset, second_offset;
8511
8512 /* A general SVE offset is A * VQ + B. Remove the A component from
8513 coefficient 0 in order to get the constant B. */
8514 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8515
8516 /* Split an out-of-range address displacement into a base and
8517 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8518 range otherwise to increase opportunities for sharing the base
8519 address of different sizes. Unaligned accesses use the signed
8520 9-bit range, TImode/TFmode use the intersection of signed
8521 scaled 7-bit and signed 9-bit offset. */
8522 if (mode == TImode || mode == TFmode)
8523 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8524 else if ((const_offset & (size - 1)) != 0)
8525 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8526 else
8527 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8528
8529 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8530 return false;
8531
8532 /* Split the offset into second_offset and the rest. */
8533 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8534 *offset2 = gen_int_mode (second_offset, Pmode);
8535 return true;
8536 }
8537 else
8538 {
8539 /* Get the mode we should use as the basis of the range. For structure
8540 modes this is the mode of one vector. */
8541 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8542 machine_mode step_mode
8543 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8544
8545 /* Get the "mul vl" multiplier we'd like to use. */
8546 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8547 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8548 if (vec_flags & VEC_SVE_DATA)
8549 /* LDR supports a 9-bit range, but the move patterns for
8550 structure modes require all vectors to be in range of the
8551 same base. The simplest way of accommodating that while still
8552 promoting reuse of anchor points between different modes is
8553 to use an 8-bit range unconditionally. */
8554 vnum = ((vnum + 128) & 255) - 128;
8555 else
8556 /* Predicates are only handled singly, so we might as well use
8557 the full range. */
8558 vnum = ((vnum + 256) & 511) - 256;
8559 if (vnum == 0)
8560 return false;
8561
8562 /* Convert the "mul vl" multiplier into a byte offset. */
8563 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8564 if (known_eq (second_offset, orig_offset))
8565 return false;
8566
8567 /* Split the offset into second_offset and the rest. */
8568 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8569 *offset2 = gen_int_mode (second_offset, Pmode);
8570 return true;
8571 }
8572 }
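
/* Illustrative sketch, not part of the port: the bias-and-mask trick
   used above to split an out-of-range constant displacement.  Adding
   0x100, masking to 9 bits and subtracting 0x100 again yields the value
   in [-256, 255] that is congruent to the original offset modulo 512;
   the remainder becomes the shared anchor.  The helper name is made up
   for this example.  */
static void
example_split_unaligned_offset (long long orig_offset,
                                long long *anchor, long long *low)
{
  long long second = ((orig_offset + 0x100) & 0x1ff) - 0x100;
  *anchor = orig_offset - second;   /* Added to the base register.  */
  *low = second;                    /* Kept in the addressing mode.  */
}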
8573
8574 /* Return the binary representation of floating point constant VALUE in INTVAL.
8575 If the value cannot be converted, return false without setting INTVAL.
8576 The conversion is done in the mode of VALUE. */
8577 bool
8578 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8579 {
8580
8581 /* We make a general exception for 0. */
8582 if (aarch64_float_const_zero_rtx_p (value))
8583 {
8584 *intval = 0;
8585 return true;
8586 }
8587
8588 scalar_float_mode mode;
8589 if (GET_CODE (value) != CONST_DOUBLE
8590 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8591 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8592 /* Only support up to DF mode. */
8593 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8594 return false;
8595
8596 unsigned HOST_WIDE_INT ival = 0;
8597
8598 long res[2];
8599 real_to_target (res,
8600 CONST_DOUBLE_REAL_VALUE (value),
8601 REAL_MODE_FORMAT (mode));
8602
8603 if (mode == DFmode)
8604 {
8605 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8606 ival = zext_hwi (res[order], 32);
8607 ival |= (zext_hwi (res[1 - order], 32) << 32);
8608 }
8609 else
8610 ival = zext_hwi (res[0], 32);
8611
8612 *intval = ival;
8613 return true;
8614 }
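
/* Illustrative sketch, not part of the port: the DFmode bit pattern the
   function above assembles with real_to_target, obtained here by type
   punning a host double.  The helper is hypothetical and assumes the
   host uses the IEEE binary64 format, e.g. 1.0 yields
   0x3ff0000000000000.  */
static unsigned long long
example_double_bits (double d)
{
  unsigned long long bits;
  __builtin_memcpy (&bits, &d, sizeof bits);
  return bits;
}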
8615
8616 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8617 single MOV(+MOVK) followed by an FMOV. */
8618 bool
8619 aarch64_float_const_rtx_p (rtx x)
8620 {
8621 machine_mode mode = GET_MODE (x);
8622 if (mode == VOIDmode)
8623 return false;
8624
8625 /* Determine whether it's cheaper to write float constants as
8626 mov/movk pairs rather than as ldr/adrp pairs. */
8627 unsigned HOST_WIDE_INT ival;
8628
8629 if (GET_CODE (x) == CONST_DOUBLE
8630 && SCALAR_FLOAT_MODE_P (mode)
8631 && aarch64_reinterpret_float_as_int (x, &ival))
8632 {
8633 scalar_int_mode imode = (mode == HFmode
8634 ? SImode
8635 : int_mode_for_mode (mode).require ());
8636 int num_instr = aarch64_internal_mov_immediate
8637 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8638 return num_instr < 3;
8639 }
8640
8641 return false;
8642 }
8643
8644 /* Return TRUE if rtx X is the immediate constant 0.0. */
8645 bool
8646 aarch64_float_const_zero_rtx_p (rtx x)
8647 {
8648 if (GET_MODE (x) == VOIDmode)
8649 return false;
8650
8651 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8652 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8653 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8654 }
8655
8656 /* Return TRUE if rtx X is an immediate constant that fits in a single
8657 MOVI immediate operation. */
8658 bool
8659 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8660 {
8661 if (!TARGET_SIMD)
8662 return false;
8663
8664 machine_mode vmode;
8665 scalar_int_mode imode;
8666 unsigned HOST_WIDE_INT ival;
8667
8668 if (GET_CODE (x) == CONST_DOUBLE
8669 && SCALAR_FLOAT_MODE_P (mode))
8670 {
8671 if (!aarch64_reinterpret_float_as_int (x, &ival))
8672 return false;
8673
8674 /* We make a general exception for 0. */
8675 if (aarch64_float_const_zero_rtx_p (x))
8676 return true;
8677
8678 imode = int_mode_for_mode (mode).require ();
8679 }
8680 else if (GET_CODE (x) == CONST_INT
8681 && is_a <scalar_int_mode> (mode, &imode))
8682 ival = INTVAL (x);
8683 else
8684 return false;
8685
8686 /* Use a 64-bit mode for everything except DI/DF mode, where we use
8687 a 128-bit vector mode. */
8688 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8689
8690 vmode = aarch64_simd_container_mode (imode, width);
8691 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8692
8693 return aarch64_simd_valid_immediate (v_op, NULL);
8694 }
8695
8696
8697 /* Return the fixed registers used for condition codes. */
8698
8699 static bool
8700 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8701 {
8702 *p1 = CC_REGNUM;
8703 *p2 = INVALID_REGNUM;
8704 return true;
8705 }
8706
8707 /* This function is used by the call expanders of the machine description.
8708 RESULT is the register in which the result is returned. It's NULL for
8709 "call" and "sibcall".
8710 MEM is the location of the function call.
8711 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8712 SIBCALL indicates whether this function call is a normal call or a sibling call.
8713 A different pattern is generated accordingly. */
8714
8715 void
8716 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8717 {
8718 rtx call, callee, tmp;
8719 rtvec vec;
8720 machine_mode mode;
8721
8722 gcc_assert (MEM_P (mem));
8723 callee = XEXP (mem, 0);
8724 mode = GET_MODE (callee);
8725 gcc_assert (mode == Pmode);
8726
8727 /* Decide if we should generate indirect calls by loading the
8728 address of the callee into a register before performing
8729 the branch-and-link. */
8730 if (SYMBOL_REF_P (callee)
8731 ? (aarch64_is_long_call_p (callee)
8732 || aarch64_is_noplt_call_p (callee))
8733 : !REG_P (callee))
8734 XEXP (mem, 0) = force_reg (mode, callee);
8735
8736 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8737
8738 if (result != NULL_RTX)
8739 call = gen_rtx_SET (result, call);
8740
8741 if (sibcall)
8742 tmp = ret_rtx;
8743 else
8744 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8745
8746 gcc_assert (CONST_INT_P (callee_abi));
8747 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8748 UNSPEC_CALLEE_ABI);
8749
8750 vec = gen_rtvec (3, call, callee_abi, tmp);
8751 call = gen_rtx_PARALLEL (VOIDmode, vec);
8752
8753 aarch64_emit_call_insn (call);
8754 }
8755
8756 /* Emit call insn with PAT and do aarch64-specific handling. */
8757
8758 void
8759 aarch64_emit_call_insn (rtx pat)
8760 {
8761 rtx insn = emit_call_insn (pat);
8762
8763 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8764 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8765 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8766 }
8767
8768 machine_mode
8769 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8770 {
8771 machine_mode mode_x = GET_MODE (x);
8772 rtx_code code_x = GET_CODE (x);
8773
8774 /* All floating point compares return CCFP if it is an equality
8775 comparison, and CCFPE otherwise. */
8776 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8777 {
8778 switch (code)
8779 {
8780 case EQ:
8781 case NE:
8782 case UNORDERED:
8783 case ORDERED:
8784 case UNLT:
8785 case UNLE:
8786 case UNGT:
8787 case UNGE:
8788 case UNEQ:
8789 return CCFPmode;
8790
8791 case LT:
8792 case LE:
8793 case GT:
8794 case GE:
8795 case LTGT:
8796 return CCFPEmode;
8797
8798 default:
8799 gcc_unreachable ();
8800 }
8801 }
8802
8803 /* Equality comparisons of short modes against zero can be performed
8804 using the TST instruction with the appropriate bitmask. */
8805 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8806 && (code == EQ || code == NE)
8807 && (mode_x == HImode || mode_x == QImode))
8808 return CC_NZmode;
8809
8810 /* Similarly, comparisons of zero_extends from shorter modes can
8811 be performed using an ANDS with an immediate mask. */
8812 if (y == const0_rtx && code_x == ZERO_EXTEND
8813 && (mode_x == SImode || mode_x == DImode)
8814 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8815 && (code == EQ || code == NE))
8816 return CC_NZmode;
8817
8818 if ((mode_x == SImode || mode_x == DImode)
8819 && y == const0_rtx
8820 && (code == EQ || code == NE || code == LT || code == GE)
8821 && (code_x == PLUS || code_x == MINUS || code_x == AND
8822 || code_x == NEG
8823 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8824 && CONST_INT_P (XEXP (x, 2)))))
8825 return CC_NZmode;
8826
8827 /* A compare with a shifted operand. Because of canonicalization,
8828 the comparison will have to be swapped when we emit the assembly
8829 code. */
8830 if ((mode_x == SImode || mode_x == DImode)
8831 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8832 && (code_x == ASHIFT || code_x == ASHIFTRT
8833 || code_x == LSHIFTRT
8834 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8835 return CC_SWPmode;
8836
8837 /* Similarly for a negated operand, but we can only do this for
8838 equalities. */
8839 if ((mode_x == SImode || mode_x == DImode)
8840 && (REG_P (y) || GET_CODE (y) == SUBREG)
8841 && (code == EQ || code == NE)
8842 && code_x == NEG)
8843 return CC_Zmode;
8844
8845 /* A test for unsigned overflow from an addition. */
8846 if ((mode_x == DImode || mode_x == TImode)
8847 && (code == LTU || code == GEU)
8848 && code_x == PLUS
8849 && rtx_equal_p (XEXP (x, 0), y))
8850 return CC_Cmode;
8851
8852 /* A test for unsigned overflow from an add with carry. */
8853 if ((mode_x == DImode || mode_x == TImode)
8854 && (code == LTU || code == GEU)
8855 && code_x == PLUS
8856 && CONST_SCALAR_INT_P (y)
8857 && (rtx_mode_t (y, mode_x)
8858 == (wi::shwi (1, mode_x)
8859 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8860 return CC_ADCmode;
8861
8862 /* A test for signed overflow. */
8863 if ((mode_x == DImode || mode_x == TImode)
8864 && code == NE
8865 && code_x == PLUS
8866 && GET_CODE (y) == SIGN_EXTEND)
8867 return CC_Vmode;
8868
8869 /* For everything else, return CCmode. */
8870 return CCmode;
8871 }
8872
8873 static int
8874 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8875
8876 int
8877 aarch64_get_condition_code (rtx x)
8878 {
8879 machine_mode mode = GET_MODE (XEXP (x, 0));
8880 enum rtx_code comp_code = GET_CODE (x);
8881
8882 if (GET_MODE_CLASS (mode) != MODE_CC)
8883 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8884 return aarch64_get_condition_code_1 (mode, comp_code);
8885 }
8886
8887 static int
8888 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8889 {
8890 switch (mode)
8891 {
8892 case E_CCFPmode:
8893 case E_CCFPEmode:
8894 switch (comp_code)
8895 {
8896 case GE: return AARCH64_GE;
8897 case GT: return AARCH64_GT;
8898 case LE: return AARCH64_LS;
8899 case LT: return AARCH64_MI;
8900 case NE: return AARCH64_NE;
8901 case EQ: return AARCH64_EQ;
8902 case ORDERED: return AARCH64_VC;
8903 case UNORDERED: return AARCH64_VS;
8904 case UNLT: return AARCH64_LT;
8905 case UNLE: return AARCH64_LE;
8906 case UNGT: return AARCH64_HI;
8907 case UNGE: return AARCH64_PL;
8908 default: return -1;
8909 }
8910 break;
8911
8912 case E_CCmode:
8913 switch (comp_code)
8914 {
8915 case NE: return AARCH64_NE;
8916 case EQ: return AARCH64_EQ;
8917 case GE: return AARCH64_GE;
8918 case GT: return AARCH64_GT;
8919 case LE: return AARCH64_LE;
8920 case LT: return AARCH64_LT;
8921 case GEU: return AARCH64_CS;
8922 case GTU: return AARCH64_HI;
8923 case LEU: return AARCH64_LS;
8924 case LTU: return AARCH64_CC;
8925 default: return -1;
8926 }
8927 break;
8928
8929 case E_CC_SWPmode:
8930 switch (comp_code)
8931 {
8932 case NE: return AARCH64_NE;
8933 case EQ: return AARCH64_EQ;
8934 case GE: return AARCH64_LE;
8935 case GT: return AARCH64_LT;
8936 case LE: return AARCH64_GE;
8937 case LT: return AARCH64_GT;
8938 case GEU: return AARCH64_LS;
8939 case GTU: return AARCH64_CC;
8940 case LEU: return AARCH64_CS;
8941 case LTU: return AARCH64_HI;
8942 default: return -1;
8943 }
8944 break;
8945
8946 case E_CC_NZCmode:
8947 switch (comp_code)
8948 {
8949 case NE: return AARCH64_NE; /* = any */
8950 case EQ: return AARCH64_EQ; /* = none */
8951 case GE: return AARCH64_PL; /* = nfrst */
8952 case LT: return AARCH64_MI; /* = first */
8953 case GEU: return AARCH64_CS; /* = nlast */
8954 case GTU: return AARCH64_HI; /* = pmore */
8955 case LEU: return AARCH64_LS; /* = plast */
8956 case LTU: return AARCH64_CC; /* = last */
8957 default: return -1;
8958 }
8959 break;
8960
8961 case E_CC_NZmode:
8962 switch (comp_code)
8963 {
8964 case NE: return AARCH64_NE;
8965 case EQ: return AARCH64_EQ;
8966 case GE: return AARCH64_PL;
8967 case LT: return AARCH64_MI;
8968 default: return -1;
8969 }
8970 break;
8971
8972 case E_CC_Zmode:
8973 switch (comp_code)
8974 {
8975 case NE: return AARCH64_NE;
8976 case EQ: return AARCH64_EQ;
8977 default: return -1;
8978 }
8979 break;
8980
8981 case E_CC_Cmode:
8982 switch (comp_code)
8983 {
8984 case LTU: return AARCH64_CS;
8985 case GEU: return AARCH64_CC;
8986 default: return -1;
8987 }
8988 break;
8989
8990 case E_CC_ADCmode:
8991 switch (comp_code)
8992 {
8993 case GEU: return AARCH64_CS;
8994 case LTU: return AARCH64_CC;
8995 default: return -1;
8996 }
8997 break;
8998
8999 case E_CC_Vmode:
9000 switch (comp_code)
9001 {
9002 case NE: return AARCH64_VS;
9003 case EQ: return AARCH64_VC;
9004 default: return -1;
9005 }
9006 break;
9007
9008 default:
9009 return -1;
9010 }
9011
9012 return -1;
9013 }
9014
9015 bool
9016 aarch64_const_vec_all_same_in_range_p (rtx x,
9017 HOST_WIDE_INT minval,
9018 HOST_WIDE_INT maxval)
9019 {
9020 rtx elt;
9021 return (const_vec_duplicate_p (x, &elt)
9022 && CONST_INT_P (elt)
9023 && IN_RANGE (INTVAL (elt), minval, maxval));
9024 }
9025
9026 bool
9027 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9028 {
9029 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9030 }
9031
9032 /* Return true if VEC is a constant in which every element is in the range
9033 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9034
9035 static bool
9036 aarch64_const_vec_all_in_range_p (rtx vec,
9037 HOST_WIDE_INT minval,
9038 HOST_WIDE_INT maxval)
9039 {
9040 if (GET_CODE (vec) != CONST_VECTOR
9041 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9042 return false;
9043
9044 int nunits;
9045 if (!CONST_VECTOR_STEPPED_P (vec))
9046 nunits = const_vector_encoded_nelts (vec);
9047 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9048 return false;
9049
9050 for (int i = 0; i < nunits; i++)
9051 {
9052 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9053 if (!CONST_INT_P (vec_elem)
9054 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9055 return false;
9056 }
9057 return true;
9058 }
9059
9060 /* N Z C V. */
9061 #define AARCH64_CC_V 1
9062 #define AARCH64_CC_C (1 << 1)
9063 #define AARCH64_CC_Z (1 << 2)
9064 #define AARCH64_CC_N (1 << 3)
9065
9066 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9067 static const int aarch64_nzcv_codes[] =
9068 {
9069 0, /* EQ, Z == 1. */
9070 AARCH64_CC_Z, /* NE, Z == 0. */
9071 0, /* CS, C == 1. */
9072 AARCH64_CC_C, /* CC, C == 0. */
9073 0, /* MI, N == 1. */
9074 AARCH64_CC_N, /* PL, N == 0. */
9075 0, /* VS, V == 1. */
9076 AARCH64_CC_V, /* VC, V == 0. */
9077 0, /* HI, C == 1 && Z == 0. */
9078 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9079 AARCH64_CC_V, /* GE, N == V. */
9080 0, /* LT, N != V. */
9081 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9082 0, /* LE, !(Z == 0 && N == V). */
9083 0, /* AL, Any. */
9084 0 /* NV, Any. */
9085 };
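
/* Illustrative model, not part of the port: how the immediates above
   are consumed.  A CCMP sets the flags from its comparison when the
   governing condition holds and from the 4-bit NZCV immediate
   otherwise; each table entry is chosen so that the immediate makes the
   corresponding condition fail.  The helper below merely restates that
   selection and is an assumption made for illustration.  */
static unsigned int
example_ccmp_flags (int cond_holds, unsigned int compare_flags,
                    unsigned int nzcv_imm)
{
  return cond_holds ? compare_flags : nzcv_imm;
}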
9086
9087 /* Print floating-point vector immediate operand X to F, negating it
9088 first if NEGATE is true. Return true on success, false if it isn't
9089 a constant we can handle. */
9090
9091 static bool
9092 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9093 {
9094 rtx elt;
9095
9096 if (!const_vec_duplicate_p (x, &elt))
9097 return false;
9098
9099 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9100 if (negate)
9101 r = real_value_negate (&r);
9102
9103 /* Handle the SVE single-bit immediates specially, since they have a
9104 fixed form in the assembly syntax. */
9105 if (real_equal (&r, &dconst0))
9106 asm_fprintf (f, "0.0");
9107 else if (real_equal (&r, &dconst2))
9108 asm_fprintf (f, "2.0");
9109 else if (real_equal (&r, &dconst1))
9110 asm_fprintf (f, "1.0");
9111 else if (real_equal (&r, &dconsthalf))
9112 asm_fprintf (f, "0.5");
9113 else
9114 {
9115 const int buf_size = 20;
9116 char float_buf[buf_size] = {'\0'};
9117 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9118 1, GET_MODE (elt));
9119 asm_fprintf (f, "%s", float_buf);
9120 }
9121
9122 return true;
9123 }
9124
9125 /* Return the b/h/s/d suffix letter for an element of SIZE bits. */
9126 static char
9127 sizetochar (int size)
9128 {
9129 switch (size)
9130 {
9131 case 64: return 'd';
9132 case 32: return 's';
9133 case 16: return 'h';
9134 case 8 : return 'b';
9135 default: gcc_unreachable ();
9136 }
9137 }
9138
9139 /* Print operand X to file F in a target specific manner according to CODE.
9140 The acceptable formatting commands given by CODE are:
9141 'c': An integer or symbol address without a preceding #
9142 sign.
9143 'C': Take the duplicated element in a vector constant
9144 and print it in hex.
9145 'D': Take the duplicated element in a vector constant
9146 and print it as an unsigned integer, in decimal.
9147 'e': Print the sign/zero-extend size as a character 8->b,
9148 16->h, 32->w. Can also be used for masks:
9149 0xff->b, 0xffff->h, 0xffffffff->w.
9150 'I': If the operand is a duplicated vector constant,
9151 replace it with the duplicated scalar. If the
9152 operand is then a floating-point constant, replace
9153 it with the integer bit representation. Print the
9154 transformed constant as a signed decimal number.
9155 'p': Prints N such that 2^N == X (X must be power of 2 and
9156 const int).
9157 'P': Print the number of non-zero bits in X (a const_int).
9158 'H': Print the higher numbered register of a pair (TImode)
9159 of regs.
9160 'm': Print a condition (eq, ne, etc).
9161 'M': Same as 'm', but invert condition.
9162 'N': Take the duplicated element in a vector constant
9163 and print the negative of it in decimal.
9164 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9165 'S/T/U/V': Print a FP/SIMD register name for a register list.
9166 The register printed is the FP/SIMD register name
9167 of X + 0/1/2/3 for S/T/U/V.
9168 'R': Print a scalar Integer/FP/SIMD register name + 1.
9169 'X': Print bottom 16 bits of integer constant in hex.
9170 'w/x': Print a general register name or the zero register
9171 (32-bit or 64-bit).
9172 '0': Print a normal operand, if it's a general register,
9173 then we assume DImode.
9174 'k': Print NZCV for conditional compare instructions.
9175 'A': Output address constant representing the first
9176 argument of X, specifying a relocation offset
9177 if appropriate.
9178 'L': Output constant address specified by X
9179 with a relocation offset if appropriate.
9180 'G': Prints address of X, specifying a PC relative
9181 relocation mode if appropriate.
9182 'y': Output address of LDP or STP - this is used for
9183 some LDP/STPs which don't use a PARALLEL in their
9184 pattern (so the mode needs to be adjusted).
9185 'z': Output address of a typical LDP or STP. */
9186
9187 static void
9188 aarch64_print_operand (FILE *f, rtx x, int code)
9189 {
9190 rtx elt;
9191 switch (code)
9192 {
9193 case 'c':
9194 switch (GET_CODE (x))
9195 {
9196 case CONST_INT:
9197 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9198 break;
9199
9200 case SYMBOL_REF:
9201 output_addr_const (f, x);
9202 break;
9203
9204 case CONST:
9205 if (GET_CODE (XEXP (x, 0)) == PLUS
9206 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9207 {
9208 output_addr_const (f, x);
9209 break;
9210 }
9211 /* Fall through. */
9212
9213 default:
9214 output_operand_lossage ("unsupported operand for code '%c'", code);
9215 }
9216 break;
9217
9218 case 'e':
9219 {
9220 x = unwrap_const_vec_duplicate (x);
9221 if (!CONST_INT_P (x))
9222 {
9223 output_operand_lossage ("invalid operand for '%%%c'", code);
9224 return;
9225 }
9226
9227 HOST_WIDE_INT val = INTVAL (x);
9228 if ((val & ~7) == 8 || val == 0xff)
9229 fputc ('b', f);
9230 else if ((val & ~7) == 16 || val == 0xffff)
9231 fputc ('h', f);
9232 else if ((val & ~7) == 32 || val == 0xffffffff)
9233 fputc ('w', f);
9234 else
9235 {
9236 output_operand_lossage ("invalid operand for '%%%c'", code);
9237 return;
9238 }
9239 }
9240 break;
9241
9242 case 'p':
9243 {
9244 int n;
9245
9246 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9247 {
9248 output_operand_lossage ("invalid operand for '%%%c'", code);
9249 return;
9250 }
9251
9252 asm_fprintf (f, "%d", n);
9253 }
9254 break;
9255
9256 case 'P':
9257 if (!CONST_INT_P (x))
9258 {
9259 output_operand_lossage ("invalid operand for '%%%c'", code);
9260 return;
9261 }
9262
9263 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9264 break;
9265
9266 case 'H':
9267 if (x == const0_rtx)
9268 {
9269 asm_fprintf (f, "xzr");
9270 break;
9271 }
9272
9273 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9274 {
9275 output_operand_lossage ("invalid operand for '%%%c'", code);
9276 return;
9277 }
9278
9279 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9280 break;
9281
9282 case 'I':
9283 {
9284 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9285 if (CONST_INT_P (x))
9286 asm_fprintf (f, "%wd", INTVAL (x));
9287 else
9288 {
9289 output_operand_lossage ("invalid operand for '%%%c'", code);
9290 return;
9291 }
9292 break;
9293 }
9294
9295 case 'M':
9296 case 'm':
9297 {
9298 int cond_code;
9299 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9300 if (x == const_true_rtx)
9301 {
9302 if (code == 'M')
9303 fputs ("nv", f);
9304 return;
9305 }
9306
9307 if (!COMPARISON_P (x))
9308 {
9309 output_operand_lossage ("invalid operand for '%%%c'", code);
9310 return;
9311 }
9312
9313 cond_code = aarch64_get_condition_code (x);
9314 gcc_assert (cond_code >= 0);
9315 if (code == 'M')
9316 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9317 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9318 fputs (aarch64_sve_condition_codes[cond_code], f);
9319 else
9320 fputs (aarch64_condition_codes[cond_code], f);
9321 }
9322 break;
9323
9324 case 'N':
9325 if (!const_vec_duplicate_p (x, &elt))
9326 {
9327 output_operand_lossage ("invalid vector constant");
9328 return;
9329 }
9330
9331 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9332 asm_fprintf (f, "%wd", -INTVAL (elt));
9333 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9334 && aarch64_print_vector_float_operand (f, x, true))
9335 ;
9336 else
9337 {
9338 output_operand_lossage ("invalid vector constant");
9339 return;
9340 }
9341 break;
9342
9343 case 'b':
9344 case 'h':
9345 case 's':
9346 case 'd':
9347 case 'q':
9348 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9349 {
9350 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9351 return;
9352 }
9353 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9354 break;
9355
9356 case 'S':
9357 case 'T':
9358 case 'U':
9359 case 'V':
9360 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9361 {
9362 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9363 return;
9364 }
9365 asm_fprintf (f, "%c%d",
9366 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9367 REGNO (x) - V0_REGNUM + (code - 'S'));
9368 break;
9369
9370 case 'R':
9371 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9372 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9373 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9374 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9375 else
9376 output_operand_lossage ("incompatible register operand for '%%%c'",
9377 code);
9378 break;
9379
9380 case 'X':
9381 if (!CONST_INT_P (x))
9382 {
9383 output_operand_lossage ("invalid operand for '%%%c'", code);
9384 return;
9385 }
9386 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9387 break;
9388
9389 case 'C':
9390 {
9391 /* Print a replicated constant in hex. */
9392 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9393 {
9394 output_operand_lossage ("invalid operand for '%%%c'", code);
9395 return;
9396 }
9397 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9398 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9399 }
9400 break;
9401
9402 case 'D':
9403 {
9404 /* Print a replicated constant in decimal, treating it as
9405 unsigned. */
9406 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9407 {
9408 output_operand_lossage ("invalid operand for '%%%c'", code);
9409 return;
9410 }
9411 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9412 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9413 }
9414 break;
9415
9416 case 'w':
9417 case 'x':
9418 if (x == const0_rtx
9419 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9420 {
9421 asm_fprintf (f, "%czr", code);
9422 break;
9423 }
9424
9425 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9426 {
9427 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9428 break;
9429 }
9430
9431 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9432 {
9433 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9434 break;
9435 }
9436
9437 /* Fall through */
9438
9439 case 0:
9440 if (x == NULL)
9441 {
9442 output_operand_lossage ("missing operand");
9443 return;
9444 }
9445
9446 switch (GET_CODE (x))
9447 {
9448 case REG:
9449 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9450 {
9451 if (REG_NREGS (x) == 1)
9452 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9453 else
9454 {
9455 char suffix
9456 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9457 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9458 REGNO (x) - V0_REGNUM, suffix,
9459 END_REGNO (x) - V0_REGNUM - 1, suffix);
9460 }
9461 }
9462 else
9463 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9464 break;
9465
9466 case MEM:
9467 output_address (GET_MODE (x), XEXP (x, 0));
9468 break;
9469
9470 case LABEL_REF:
9471 case SYMBOL_REF:
9472 output_addr_const (asm_out_file, x);
9473 break;
9474
9475 case CONST_INT:
9476 asm_fprintf (f, "%wd", INTVAL (x));
9477 break;
9478
9479 case CONST:
9480 if (!VECTOR_MODE_P (GET_MODE (x)))
9481 {
9482 output_addr_const (asm_out_file, x);
9483 break;
9484 }
9485 /* fall through */
9486
9487 case CONST_VECTOR:
9488 if (!const_vec_duplicate_p (x, &elt))
9489 {
9490 output_operand_lossage ("invalid vector constant");
9491 return;
9492 }
9493
9494 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9495 asm_fprintf (f, "%wd", INTVAL (elt));
9496 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9497 && aarch64_print_vector_float_operand (f, x, false))
9498 ;
9499 else
9500 {
9501 output_operand_lossage ("invalid vector constant");
9502 return;
9503 }
9504 break;
9505
9506 case CONST_DOUBLE:
9507 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9508 be getting CONST_DOUBLEs holding integers. */
9509 gcc_assert (GET_MODE (x) != VOIDmode);
9510 if (aarch64_float_const_zero_rtx_p (x))
9511 {
9512 fputc ('0', f);
9513 break;
9514 }
9515 else if (aarch64_float_const_representable_p (x))
9516 {
9517 #define buf_size 20
9518 char float_buf[buf_size] = {'\0'};
9519 real_to_decimal_for_mode (float_buf,
9520 CONST_DOUBLE_REAL_VALUE (x),
9521 buf_size, buf_size,
9522 1, GET_MODE (x));
9523 asm_fprintf (asm_out_file, "%s", float_buf);
9524 break;
9525 #undef buf_size
9526 }
9527 output_operand_lossage ("invalid constant");
9528 return;
9529 default:
9530 output_operand_lossage ("invalid operand");
9531 return;
9532 }
9533 break;
9534
9535 case 'A':
9536 if (GET_CODE (x) == HIGH)
9537 x = XEXP (x, 0);
9538
9539 switch (aarch64_classify_symbolic_expression (x))
9540 {
9541 case SYMBOL_SMALL_GOT_4G:
9542 asm_fprintf (asm_out_file, ":got:");
9543 break;
9544
9545 case SYMBOL_SMALL_TLSGD:
9546 asm_fprintf (asm_out_file, ":tlsgd:");
9547 break;
9548
9549 case SYMBOL_SMALL_TLSDESC:
9550 asm_fprintf (asm_out_file, ":tlsdesc:");
9551 break;
9552
9553 case SYMBOL_SMALL_TLSIE:
9554 asm_fprintf (asm_out_file, ":gottprel:");
9555 break;
9556
9557 case SYMBOL_TLSLE24:
9558 asm_fprintf (asm_out_file, ":tprel:");
9559 break;
9560
9561 case SYMBOL_TINY_GOT:
9562 gcc_unreachable ();
9563 break;
9564
9565 default:
9566 break;
9567 }
9568 output_addr_const (asm_out_file, x);
9569 break;
9570
9571 case 'L':
9572 switch (aarch64_classify_symbolic_expression (x))
9573 {
9574 case SYMBOL_SMALL_GOT_4G:
9575 asm_fprintf (asm_out_file, ":lo12:");
9576 break;
9577
9578 case SYMBOL_SMALL_TLSGD:
9579 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9580 break;
9581
9582 case SYMBOL_SMALL_TLSDESC:
9583 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9584 break;
9585
9586 case SYMBOL_SMALL_TLSIE:
9587 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9588 break;
9589
9590 case SYMBOL_TLSLE12:
9591 asm_fprintf (asm_out_file, ":tprel_lo12:");
9592 break;
9593
9594 case SYMBOL_TLSLE24:
9595 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9596 break;
9597
9598 case SYMBOL_TINY_GOT:
9599 asm_fprintf (asm_out_file, ":got:");
9600 break;
9601
9602 case SYMBOL_TINY_TLSIE:
9603 asm_fprintf (asm_out_file, ":gottprel:");
9604 break;
9605
9606 default:
9607 break;
9608 }
9609 output_addr_const (asm_out_file, x);
9610 break;
9611
9612 case 'G':
9613 switch (aarch64_classify_symbolic_expression (x))
9614 {
9615 case SYMBOL_TLSLE24:
9616 asm_fprintf (asm_out_file, ":tprel_hi12:");
9617 break;
9618 default:
9619 break;
9620 }
9621 output_addr_const (asm_out_file, x);
9622 break;
9623
9624 case 'k':
9625 {
9626 HOST_WIDE_INT cond_code;
9627
9628 if (!CONST_INT_P (x))
9629 {
9630 output_operand_lossage ("invalid operand for '%%%c'", code);
9631 return;
9632 }
9633
9634 cond_code = INTVAL (x);
9635 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9636 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9637 }
9638 break;
9639
9640 case 'y':
9641 case 'z':
9642 {
9643 machine_mode mode = GET_MODE (x);
9644
9645 if (GET_CODE (x) != MEM
9646 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9647 {
9648 output_operand_lossage ("invalid operand for '%%%c'", code);
9649 return;
9650 }
9651
9652 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9653 code == 'y'
9654 ? ADDR_QUERY_LDP_STP_N
9655 : ADDR_QUERY_LDP_STP))
9656 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9657 }
9658 break;
9659
9660 default:
9661 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9662 return;
9663 }
9664 }
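
/* Illustrative sketch, not part of the port: the classification used by
   the 'e' operand code above, written as a standalone helper.  A value
   is accepted either as an extend size with its low three bits ignored
   (8..15 -> 'b', 16..23 -> 'h', 32..39 -> 'w') or as the matching
   zero-extension mask.  The helper name is made up for this example.  */
static char
example_extend_suffix (unsigned long long val)
{
  if ((val & ~7ULL) == 8 || val == 0xff)
    return 'b';
  if ((val & ~7ULL) == 16 || val == 0xffff)
    return 'h';
  if ((val & ~7ULL) == 32 || val == 0xffffffff)
    return 'w';
  return 0;   /* Not a recognized extend size or mask.  */
}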
9665
9666 /* Print address 'x' of a memory access with mode 'mode'.
9667 'type' is the aarch64_addr_query_type context required by
9668 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
9669 static bool
9670 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9671 aarch64_addr_query_type type)
9672 {
9673 struct aarch64_address_info addr;
9674 unsigned int size, vec_flags;
9675
9676 /* Check all addresses are Pmode - including ILP32. */
9677 if (GET_MODE (x) != Pmode
9678 && (!CONST_INT_P (x)
9679 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9680 {
9681 output_operand_lossage ("invalid address mode");
9682 return false;
9683 }
9684
9685 if (aarch64_classify_address (&addr, x, mode, true, type))
9686 switch (addr.type)
9687 {
9688 case ADDRESS_REG_IMM:
9689 if (known_eq (addr.const_offset, 0))
9690 {
9691 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9692 return true;
9693 }
9694
9695 vec_flags = aarch64_classify_vector_mode (mode);
9696 if (vec_flags & VEC_ANY_SVE)
9697 {
9698 HOST_WIDE_INT vnum
9699 = exact_div (addr.const_offset,
9700 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9701 asm_fprintf (f, "[%s, #%wd, mul vl]",
9702 reg_names[REGNO (addr.base)], vnum);
9703 return true;
9704 }
9705
9706 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9707 INTVAL (addr.offset));
9708 return true;
9709
9710 case ADDRESS_REG_REG:
9711 if (addr.shift == 0)
9712 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9713 reg_names [REGNO (addr.offset)]);
9714 else
9715 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9716 reg_names [REGNO (addr.offset)], addr.shift);
9717 return true;
9718
9719 case ADDRESS_REG_UXTW:
9720 if (addr.shift == 0)
9721 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9722 REGNO (addr.offset) - R0_REGNUM);
9723 else
9724 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9725 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9726 return true;
9727
9728 case ADDRESS_REG_SXTW:
9729 if (addr.shift == 0)
9730 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9731 REGNO (addr.offset) - R0_REGNUM);
9732 else
9733 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9734 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9735 return true;
9736
9737 case ADDRESS_REG_WB:
9738 /* Writeback is only supported for fixed-width modes. */
9739 size = GET_MODE_SIZE (mode).to_constant ();
9740 switch (GET_CODE (x))
9741 {
9742 case PRE_INC:
9743 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9744 return true;
9745 case POST_INC:
9746 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9747 return true;
9748 case PRE_DEC:
9749 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9750 return true;
9751 case POST_DEC:
9752 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9753 return true;
9754 case PRE_MODIFY:
9755 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9756 INTVAL (addr.offset));
9757 return true;
9758 case POST_MODIFY:
9759 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9760 INTVAL (addr.offset));
9761 return true;
9762 default:
9763 break;
9764 }
9765 break;
9766
9767 case ADDRESS_LO_SUM:
9768 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9769 output_addr_const (f, addr.offset);
9770 asm_fprintf (f, "]");
9771 return true;
9772
9773 case ADDRESS_SYMBOLIC:
9774 output_addr_const (f, x);
9775 return true;
9776 }
9777
9778 return false;
9779 }
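
/* Illustrative sketch, not part of the port: the index printed by the
   SVE "mul vl" form above is the constant offset divided by the vector
   length in bytes; exact_div in the real code asserts that the division
   is exact.  The helper name is made up for this example.  */
static long long
example_mul_vl_index (long long const_offset, long long vl_bytes)
{
  /* e.g. offset 32 with a 16-byte vector length prints as #2, mul vl.  */
  return const_offset / vl_bytes;
}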
9780
9781 /* Print address 'x' of a memory access with mode 'mode'. */
9782 static void
9783 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9784 {
9785 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9786 output_addr_const (f, x);
9787 }
9788
9789 bool
9790 aarch64_label_mentioned_p (rtx x)
9791 {
9792 const char *fmt;
9793 int i;
9794
9795 if (GET_CODE (x) == LABEL_REF)
9796 return true;
9797
9798 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9799 referencing instruction, but they are constant offsets, not
9800 symbols. */
9801 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9802 return false;
9803
9804 fmt = GET_RTX_FORMAT (GET_CODE (x));
9805 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9806 {
9807 if (fmt[i] == 'E')
9808 {
9809 int j;
9810
9811 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9812 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9813 return 1;
9814 }
9815 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9816 return 1;
9817 }
9818
9819 return 0;
9820 }
9821
9822 /* Implement REGNO_REG_CLASS. */
9823
9824 enum reg_class
9825 aarch64_regno_regclass (unsigned regno)
9826 {
9827 if (GP_REGNUM_P (regno))
9828 return GENERAL_REGS;
9829
9830 if (regno == SP_REGNUM)
9831 return STACK_REG;
9832
9833 if (regno == FRAME_POINTER_REGNUM
9834 || regno == ARG_POINTER_REGNUM)
9835 return POINTER_REGS;
9836
9837 if (FP_REGNUM_P (regno))
9838 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9839 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9840
9841 if (PR_REGNUM_P (regno))
9842 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9843
9844 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
9845 return FFR_REGS;
9846
9847 return NO_REGS;
9848 }
9849
9850 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9851 If OFFSET is out of range, return an offset of an anchor point
9852 that is in range. Return 0 otherwise. */
9853
9854 static HOST_WIDE_INT
9855 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9856 machine_mode mode)
9857 {
9858 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9859 if (size > 16)
9860 return (offset + 0x400) & ~0x7f0;
9861
9862 /* For offsets that aren't a multiple of the access size, the limit is
9863 -256...255. */
9864 if (offset & (size - 1))
9865 {
9866 /* BLKmode typically uses LDP of X-registers. */
9867 if (mode == BLKmode)
9868 return (offset + 512) & ~0x3ff;
9869 return (offset + 0x100) & ~0x1ff;
9870 }
9871
9872 /* Small negative offsets are supported. */
9873 if (IN_RANGE (offset, -256, 0))
9874 return 0;
9875
9876 if (mode == TImode || mode == TFmode)
9877 return (offset + 0x100) & ~0x1ff;
9878
9879 /* Use a 12-bit offset scaled by the access size. */
9880 return offset & (~0xfff * size);
9881 }
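
/* Illustrative sketch, not part of the port: the rounding used above
   for offsets that are not a multiple of the access size.
   (offset + 0x100) & ~0x1ff picks the surrounding multiple of 512 as
   the anchor, so the residual displacement always fits the signed 9-bit
   range, e.g. 700 -> 512 (residual 188) and -300 -> -512 (residual
   212).  The helper name is made up for this example.  */
static long long
example_round_to_anchor (long long offset)
{
  return (offset + 0x100) & ~0x1ffLL;
}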
9882
9883 static rtx
9884 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9885 {
9886 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9887 where mask is selected by alignment and size of the offset.
9888 We try to pick as large a range for the offset as possible to
9889 maximize the chance of a CSE. However, for aligned addresses
9890 we limit the range to 4k so that structures with different sized
9891 elements are likely to use the same base. We need to be careful
9892 not to split a CONST for some forms of address expression, otherwise
9893 it will generate sub-optimal code. */
9894
9895 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9896 {
9897 rtx base = XEXP (x, 0);
9898 rtx offset_rtx = XEXP (x, 1);
9899 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9900
9901 if (GET_CODE (base) == PLUS)
9902 {
9903 rtx op0 = XEXP (base, 0);
9904 rtx op1 = XEXP (base, 1);
9905
9906 /* Force any scaling into a temp for CSE. */
9907 op0 = force_reg (Pmode, op0);
9908 op1 = force_reg (Pmode, op1);
9909
9910 /* Let the pointer register be in op0. */
9911 if (REG_POINTER (op1))
9912 std::swap (op0, op1);
9913
9914 /* If the pointer is virtual or frame related, then we know that
9915 virtual register instantiation or register elimination is going
9916 to apply a second constant. We want the two constants folded
9917 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9918 if (virt_or_elim_regno_p (REGNO (op0)))
9919 {
9920 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9921 NULL_RTX, true, OPTAB_DIRECT);
9922 return gen_rtx_PLUS (Pmode, base, op1);
9923 }
9924
9925 /* Otherwise, in order to encourage CSE (and thence loop strength
9926 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9927 base = expand_binop (Pmode, add_optab, op0, op1,
9928 NULL_RTX, true, OPTAB_DIRECT);
9929 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9930 }
9931
9932 HOST_WIDE_INT size;
9933 if (GET_MODE_SIZE (mode).is_constant (&size))
9934 {
9935 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9936 mode);
9937 if (base_offset != 0)
9938 {
9939 base = plus_constant (Pmode, base, base_offset);
9940 base = force_operand (base, NULL_RTX);
9941 return plus_constant (Pmode, base, offset - base_offset);
9942 }
9943 }
9944 }
9945
9946 return x;
9947 }
9948
9949 static reg_class_t
9950 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9951 reg_class_t rclass,
9952 machine_mode mode,
9953 secondary_reload_info *sri)
9954 {
9955 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9956 directly by the *aarch64_sve_mov<mode>_[lb]e move patterns. See the
9957 comment at the head of aarch64-sve.md for more details about the
9958 big-endian handling. */
9959 if (BYTES_BIG_ENDIAN
9960 && reg_class_subset_p (rclass, FP_REGS)
9961 && !((REG_P (x) && HARD_REGISTER_P (x))
9962 || aarch64_simd_valid_immediate (x, NULL))
9963 && mode != VNx16QImode
9964 && aarch64_sve_data_mode_p (mode))
9965 {
9966 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9967 return NO_REGS;
9968 }
9969
9970 /* If we have to disable direct literal pool loads and stores because the
9971 function is too big, then we need a scratch register. */
9972 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9973 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9974 || targetm.vector_mode_supported_p (GET_MODE (x)))
9975 && !aarch64_pcrelative_literal_loads)
9976 {
9977 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9978 return NO_REGS;
9979 }
9980
9981 /* Without the TARGET_SIMD instructions we cannot move a Q register
9982 to a Q register directly. We need a scratch. */
9983 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9984 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9985 && reg_class_subset_p (rclass, FP_REGS))
9986 {
9987 sri->icode = code_for_aarch64_reload_mov (mode);
9988 return NO_REGS;
9989 }
9990
9991 /* A TFmode or TImode memory access should be handled via FP_REGS
9992 because AArch64 has richer addressing modes for LDR/STR instructions
9993 than LDP/STP instructions. */
9994 if (TARGET_FLOAT && rclass == GENERAL_REGS
9995 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9996 return FP_REGS;
9997
9998 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9999 return GENERAL_REGS;
10000
10001 return NO_REGS;
10002 }
10003
10004 static bool
10005 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10006 {
10007 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10008
10009 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10010 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10011 if (frame_pointer_needed)
10012 return to == HARD_FRAME_POINTER_REGNUM;
10013 return true;
10014 }
10015
10016 poly_int64
10017 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10018 {
10019 if (to == HARD_FRAME_POINTER_REGNUM)
10020 {
10021 if (from == ARG_POINTER_REGNUM)
10022 return cfun->machine->frame.hard_fp_offset;
10023
10024 if (from == FRAME_POINTER_REGNUM)
10025 return cfun->machine->frame.hard_fp_offset
10026 - cfun->machine->frame.locals_offset;
10027 }
10028
10029 if (to == STACK_POINTER_REGNUM)
10030 {
10031 if (from == FRAME_POINTER_REGNUM)
10032 return cfun->machine->frame.frame_size
10033 - cfun->machine->frame.locals_offset;
10034 }
10035
10036 return cfun->machine->frame.frame_size;
10037 }
10038
10039 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10040 previous frame. */
10041
10042 rtx
10043 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10044 {
10045 if (count != 0)
10046 return const0_rtx;
10047 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10048 }
10049
10050
10051 static void
10052 aarch64_asm_trampoline_template (FILE *f)
10053 {
10054 int offset1 = 16;
10055 int offset2 = 20;
10056
10057 if (aarch64_bti_enabled ())
10058 {
10059 asm_fprintf (f, "\thint\t34 // bti c\n");
10060 offset1 -= 4;
10061 offset2 -= 4;
10062 }
10063
10064 if (TARGET_ILP32)
10065 {
10066 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10067 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10068 offset1);
10069 }
10070 else
10071 {
10072 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10073 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10074 offset2);
10075 }
10076 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10077
10078 /* The trampoline needs an extra padding instruction. If BTI is
10079 enabled, the padding instruction is replaced by the BTI instruction at
10080 the beginning. */
10081 if (!aarch64_bti_enabled ())
10082 assemble_aligned_integer (4, const0_rtx);
10083
10084 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10085 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10086 }
10087
10088 static void
10089 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10090 {
10091 rtx fnaddr, mem, a_tramp;
10092 const int tramp_code_sz = 16;
10093
10094 /* Don't need to copy the trailing D-words; we fill those in below. */
10095 emit_block_move (m_tramp, assemble_trampoline_template (),
10096 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10097 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10098 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10099 if (GET_MODE (fnaddr) != ptr_mode)
10100 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10101 emit_move_insn (mem, fnaddr);
10102
10103 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10104 emit_move_insn (mem, chain_value);
10105
10106 /* XXX We should really define a "clear_cache" pattern and use
10107 gen_clear_cache(). */
10108 a_tramp = XEXP (m_tramp, 0);
10109 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10110 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10111 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10112 ptr_mode);
10113 }
10114
10115 static unsigned char
10116 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10117 {
10118 /* ??? Logically we should only need to provide a value when
10119 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10120 can hold MODE, but at the moment we need to handle all modes.
10121 Just ignore any runtime parts for registers that can't store them. */
10122 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10123 unsigned int nregs, vec_flags;
10124 switch (regclass)
10125 {
10126 case TAILCALL_ADDR_REGS:
10127 case POINTER_REGS:
10128 case GENERAL_REGS:
10129 case ALL_REGS:
10130 case POINTER_AND_FP_REGS:
10131 case FP_REGS:
10132 case FP_LO_REGS:
10133 case FP_LO8_REGS:
10134 vec_flags = aarch64_classify_vector_mode (mode);
10135 if ((vec_flags & VEC_SVE_DATA)
10136 && constant_multiple_p (GET_MODE_SIZE (mode),
10137 aarch64_vl_bytes (mode, vec_flags), &nregs))
10138 return nregs;
10139 return (vec_flags & VEC_ADVSIMD
10140 ? CEIL (lowest_size, UNITS_PER_VREG)
10141 : CEIL (lowest_size, UNITS_PER_WORD));
10142 case STACK_REG:
10143 case PR_REGS:
10144 case PR_LO_REGS:
10145 case PR_HI_REGS:
10146 case FFR_REGS:
10147 case PR_AND_FFR_REGS:
10148 return 1;
10149
10150 case NO_REGS:
10151 return 0;
10152
10153 default:
10154 break;
10155 }
10156 gcc_unreachable ();
10157 }
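
/* Illustrative sketch, not part of the port: the CEIL computation the
   function above applies to fixed-size modes, i.e. how many registers
   of UNIT bytes are needed to hold SIZE bytes.  */
static unsigned int
example_ceil_div (unsigned int size, unsigned int unit)
{
  /* e.g. a 24-byte value in 16-byte vector registers needs 2 of them.  */
  return (size + unit - 1) / unit;
}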
10158
10159 static reg_class_t
10160 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10161 {
10162 if (regclass == POINTER_REGS)
10163 return GENERAL_REGS;
10164
10165 if (regclass == STACK_REG)
10166 {
10167 if (REG_P(x)
10168 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10169 return regclass;
10170
10171 return NO_REGS;
10172 }
10173
10174 /* Register elimination can result in a request for
10175 SP+constant->FP_REGS. We cannot support such operations, which
10176 use SP as source and an FP_REG as destination, so reject them
10177 outright. */
10178 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10179 {
10180 rtx lhs = XEXP (x, 0);
10181
10182 /* Look through a possible SUBREG introduced by ILP32. */
10183 if (GET_CODE (lhs) == SUBREG)
10184 lhs = SUBREG_REG (lhs);
10185
10186 gcc_assert (REG_P (lhs));
10187 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10188 POINTER_REGS));
10189 return NO_REGS;
10190 }
10191
10192 return regclass;
10193 }
10194
10195 void
10196 aarch64_asm_output_labelref (FILE* f, const char *name)
10197 {
10198 asm_fprintf (f, "%U%s", name);
10199 }
10200
10201 static void
10202 aarch64_elf_asm_constructor (rtx symbol, int priority)
10203 {
10204 if (priority == DEFAULT_INIT_PRIORITY)
10205 default_ctor_section_asm_out_constructor (symbol, priority);
10206 else
10207 {
10208 section *s;
10209 /* Priority is known to be in the range [0, 65535], so 18 bytes
10210 would be enough, but the compiler might not know that. To avoid a
10211 -Wformat-truncation false positive, use a larger size. */
10212 char buf[23];
10213 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10214 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10215 switch_to_section (s);
10216 assemble_align (POINTER_SIZE);
10217 assemble_aligned_integer (POINTER_BYTES, symbol);
10218 }
10219 }
10220
10221 static void
10222 aarch64_elf_asm_destructor (rtx symbol, int priority)
10223 {
10224 if (priority == DEFAULT_INIT_PRIORITY)
10225 default_dtor_section_asm_out_destructor (symbol, priority);
10226 else
10227 {
10228 section *s;
10229 /* Priority is known to be in the range [0, 65535], so 18 bytes
10230 would be enough, but the compiler might not know that. To avoid a
10231 -Wformat-truncation false positive, use a larger size. */
10232 char buf[23];
10233 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10234 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10235 switch_to_section (s);
10236 assemble_align (POINTER_SIZE);
10237 assemble_aligned_integer (POINTER_BYTES, symbol);
10238 }
10239 }
10240
10241 const char*
10242 aarch64_output_casesi (rtx *operands)
10243 {
10244 char buf[100];
10245 char label[100];
10246 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10247 int index;
10248 static const char *const patterns[4][2] =
10249 {
10250 {
10251 "ldrb\t%w3, [%0,%w1,uxtw]",
10252 "add\t%3, %4, %w3, sxtb #2"
10253 },
10254 {
10255 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10256 "add\t%3, %4, %w3, sxth #2"
10257 },
10258 {
10259 "ldr\t%w3, [%0,%w1,uxtw #2]",
10260 "add\t%3, %4, %w3, sxtw #2"
10261 },
10262 /* We assume that DImode is only generated when not optimizing and
10263 that we don't really need 64-bit address offsets. That would
10264 imply an object file with 8GB of code in a single function! */
10265 {
10266 "ldr\t%w3, [%0,%w1,uxtw #2]",
10267 "add\t%3, %4, %w3, sxtw #2"
10268 }
10269 };
10270
10271 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10272
10273 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10274 index = exact_log2 (GET_MODE_SIZE (mode));
10275
10276 gcc_assert (index >= 0 && index <= 3);
10277
10278 /* Need to implement table size reduction, by changing the code below. */
10279 output_asm_insn (patterns[index][0], operands);
10280 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10281 snprintf (buf, sizeof (buf),
10282 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10283 output_asm_insn (buf, operands);
10284 output_asm_insn (patterns[index][1], operands);
10285 output_asm_insn ("br\t%3", operands);
10286 assemble_label (asm_out_file, label);
10287 return "";
10288 }
10289
10290
10291 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10292 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10293 operator. */
10294
10295 int
10296 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10297 {
10298 if (shift >= 0 && shift <= 3)
10299 {
10300 int size;
10301 for (size = 8; size <= 32; size *= 2)
10302 {
10303 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10304 if (mask == bits << shift)
10305 return size;
10306 }
10307 }
10308 return 0;
10309 }
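
/* Illustrative usage, not part of the port: a mask of 0x1fe with a
   shift of 1 is 0xff shifted left by one, so aarch64_uxt_size returns 8
   (a UXTB pattern); masks that are not a shifted 0xff, 0xffff or
   0xffffffff yield 0.  The wrapper below only demonstrates the call and
   is an assumption made for illustration.  */
static int
example_uxt_size_usage (void)
{
  return aarch64_uxt_size (1, 0x1fe);   /* Returns 8.  */
}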
10310
10311 /* Constant pools are per-function only when PC-relative
10312 literal loads are enabled or we are using the large memory
10313 model. */
10314
10315 static inline bool
10316 aarch64_can_use_per_function_literal_pools_p (void)
10317 {
10318 return (aarch64_pcrelative_literal_loads
10319 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10320 }
10321
10322 static bool
10323 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10324 {
10325 /* We can't use blocks for constants when we're using a per-function
10326 constant pool. */
10327 return !aarch64_can_use_per_function_literal_pools_p ();
10328 }
10329
10330 /* Select appropriate section for constants depending
10331 on where we place literal pools. */
10332
10333 static section *
10334 aarch64_select_rtx_section (machine_mode mode,
10335 rtx x,
10336 unsigned HOST_WIDE_INT align)
10337 {
10338 if (aarch64_can_use_per_function_literal_pools_p ())
10339 return function_section (current_function_decl);
10340
10341 return default_elf_select_rtx_section (mode, x, align);
10342 }
10343
10344 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10345 void
10346 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10347 HOST_WIDE_INT offset)
10348 {
10349 /* When using per-function literal pools, we must ensure that any code
10350 section is aligned to the minimal instruction length, lest we get
10351 errors from the assembler about "unaligned instructions". */
10352 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10353 ASM_OUTPUT_ALIGN (f, 2);
10354 }
10355
10356 /* Costs. */
10357
10358 /* Helper function for rtx cost calculation. Strip a shift expression
10359 from X. Returns the inner operand if successful, or the original
10360 expression on failure. */
10361 static rtx
10362 aarch64_strip_shift (rtx x)
10363 {
10364 rtx op = x;
10365
10366 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10367 we can convert both to ROR during final output. */
10368 if ((GET_CODE (op) == ASHIFT
10369 || GET_CODE (op) == ASHIFTRT
10370 || GET_CODE (op) == LSHIFTRT
10371 || GET_CODE (op) == ROTATERT
10372 || GET_CODE (op) == ROTATE)
10373 && CONST_INT_P (XEXP (op, 1)))
10374 return XEXP (op, 0);
10375
10376 if (GET_CODE (op) == MULT
10377 && CONST_INT_P (XEXP (op, 1))
10378 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10379 return XEXP (op, 0);
10380
10381 return x;
10382 }
10383
10384 /* Helper function for rtx cost calculation. Strip an extend
10385 expression from X. Returns the inner operand if successful, or the
10386 original expression on failure. We deal with a number of possible
10387 canonicalization variations here. If STRIP_SHIFT is true, then
10388 we can strip off a shift also. */
10389 static rtx
10390 aarch64_strip_extend (rtx x, bool strip_shift)
10391 {
10392 scalar_int_mode mode;
10393 rtx op = x;
10394
10395 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10396 return op;
10397
10398 /* Zero and sign extraction of a widened value. */
10399 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10400 && XEXP (op, 2) == const0_rtx
10401 && GET_CODE (XEXP (op, 0)) == MULT
10402 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10403 XEXP (op, 1)))
10404 return XEXP (XEXP (op, 0), 0);
10405
10406 /* It can also be represented (for zero-extend) as an AND with an
10407 immediate. */
10408 if (GET_CODE (op) == AND
10409 && GET_CODE (XEXP (op, 0)) == MULT
10410 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10411 && CONST_INT_P (XEXP (op, 1))
10412 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10413 INTVAL (XEXP (op, 1))) != 0)
10414 return XEXP (XEXP (op, 0), 0);
10415
10416 /* Now handle extended register, as this may also have an optional
10417 left shift by 1..4. */
10418 if (strip_shift
10419 && GET_CODE (op) == ASHIFT
10420 && CONST_INT_P (XEXP (op, 1))
10421 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10422 op = XEXP (op, 0);
10423
10424 if (GET_CODE (op) == ZERO_EXTEND
10425 || GET_CODE (op) == SIGN_EXTEND)
10426 op = XEXP (op, 0);
10427
10428 if (op != x)
10429 return op;
10430
10431 return x;
10432 }
10433
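/* Editor's note, an illustrative walk-through of the stripping above
   (assumed example, not taken from a real RTL dump): with STRIP_SHIFT
   true, an operand such as
     (ashift:DI (zero_extend:DI (reg:SI x1)) (const_int 2))
   first loses the shift (the amount 2 is within 1..4) and then the
   zero_extend, so the value returned is (reg:SI x1) and callers cost
   only the inner register.  */
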
10434 /* Return true iff CODE is a shift supported in combination
10435 with arithmetic instructions. */
10436
10437 static bool
10438 aarch64_shift_p (enum rtx_code code)
10439 {
10440 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10441 }
10442
10443
10444 /* Return true iff X is a cheap shift without a sign extend. */
10445
10446 static bool
10447 aarch64_cheap_mult_shift_p (rtx x)
10448 {
10449 rtx op0, op1;
10450
10451 op0 = XEXP (x, 0);
10452 op1 = XEXP (x, 1);
10453
10454 if (!(aarch64_tune_params.extra_tuning_flags
10455 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10456 return false;
10457
10458 if (GET_CODE (op0) == SIGN_EXTEND)
10459 return false;
10460
10461 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10462 && UINTVAL (op1) <= 4)
10463 return true;
10464
10465 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10466 return false;
10467
10468 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10469
10470 if (l2 > 0 && l2 <= 4)
10471 return true;
10472
10473 return false;
10474 }
10475
10476 /* Helper function for rtx cost calculation. Calculate the cost of
10477 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10478 Return the calculated cost of the expression, recursing manually into
10479 operands where needed. */
10480
10481 static int
10482 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10483 {
10484 rtx op0, op1;
10485 const struct cpu_cost_table *extra_cost
10486 = aarch64_tune_params.insn_extra_cost;
10487 int cost = 0;
10488 bool compound_p = (outer == PLUS || outer == MINUS);
10489 machine_mode mode = GET_MODE (x);
10490
10491 gcc_checking_assert (code == MULT);
10492
10493 op0 = XEXP (x, 0);
10494 op1 = XEXP (x, 1);
10495
10496 if (VECTOR_MODE_P (mode))
10497 mode = GET_MODE_INNER (mode);
10498
10499 /* Integer multiply/fma. */
10500 if (GET_MODE_CLASS (mode) == MODE_INT)
10501 {
10502 /* The multiply will be canonicalized as a shift, cost it as such. */
10503 if (aarch64_shift_p (GET_CODE (x))
10504 || (CONST_INT_P (op1)
10505 && exact_log2 (INTVAL (op1)) > 0))
10506 {
10507 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10508 || GET_CODE (op0) == SIGN_EXTEND;
10509 if (speed)
10510 {
10511 if (compound_p)
10512 {
10513 /* If the shift is considered cheap,
10514 then don't add any cost. */
10515 if (aarch64_cheap_mult_shift_p (x))
10516 ;
10517 else if (REG_P (op1))
10518 /* ARITH + shift-by-register. */
10519 cost += extra_cost->alu.arith_shift_reg;
10520 else if (is_extend)
10521 /* ARITH + extended register. We don't have a cost field
10522 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10523 cost += extra_cost->alu.extend_arith;
10524 else
10525 /* ARITH + shift-by-immediate. */
10526 cost += extra_cost->alu.arith_shift;
10527 }
10528 else
10529 /* LSL (immediate). */
10530 cost += extra_cost->alu.shift;
10531
10532 }
10533 /* Strip extends as we will have costed them in the case above. */
10534 if (is_extend)
10535 op0 = aarch64_strip_extend (op0, true);
10536
10537 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10538
10539 return cost;
10540 }
10541
10542 /* MNEG or [US]MNEGL. Extract the NEG operand, mark this as a
10543 compound operation, and let the cases below handle it. After all,
10544 MNEG is a special-case alias of MSUB. */
10545 if (GET_CODE (op0) == NEG)
10546 {
10547 op0 = XEXP (op0, 0);
10548 compound_p = true;
10549 }
10550
10551 /* Integer multiplies or FMAs have zero/sign extending variants. */
10552 if ((GET_CODE (op0) == ZERO_EXTEND
10553 && GET_CODE (op1) == ZERO_EXTEND)
10554 || (GET_CODE (op0) == SIGN_EXTEND
10555 && GET_CODE (op1) == SIGN_EXTEND))
10556 {
10557 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10558 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10559
10560 if (speed)
10561 {
10562 if (compound_p)
10563 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10564 cost += extra_cost->mult[0].extend_add;
10565 else
10566 /* MUL/SMULL/UMULL. */
10567 cost += extra_cost->mult[0].extend;
10568 }
10569
10570 return cost;
10571 }
10572
10573 /* This is either an integer multiply or a MADD. In both cases
10574 we want to recurse and cost the operands. */
10575 cost += rtx_cost (op0, mode, MULT, 0, speed);
10576 cost += rtx_cost (op1, mode, MULT, 1, speed);
10577
10578 if (speed)
10579 {
10580 if (compound_p)
10581 /* MADD/MSUB. */
10582 cost += extra_cost->mult[mode == DImode].add;
10583 else
10584 /* MUL. */
10585 cost += extra_cost->mult[mode == DImode].simple;
10586 }
10587
10588 return cost;
10589 }
10590 else
10591 {
10592 if (speed)
10593 {
10594 /* Floating-point FMA/FMUL can also support negations of the
10595 operands, unless the rounding mode is upward or downward in
10596 which case FNMUL is different than FMUL with operand negation. */
10597 bool neg0 = GET_CODE (op0) == NEG;
10598 bool neg1 = GET_CODE (op1) == NEG;
10599 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10600 {
10601 if (neg0)
10602 op0 = XEXP (op0, 0);
10603 if (neg1)
10604 op1 = XEXP (op1, 0);
10605 }
10606
10607 if (compound_p)
10608 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10609 cost += extra_cost->fp[mode == DFmode].fma;
10610 else
10611 /* FMUL/FNMUL. */
10612 cost += extra_cost->fp[mode == DFmode].mult;
10613 }
10614
10615 cost += rtx_cost (op0, mode, MULT, 0, speed);
10616 cost += rtx_cost (op1, mode, MULT, 1, speed);
10617 return cost;
10618 }
10619 }
10620
10621 static int
10622 aarch64_address_cost (rtx x,
10623 machine_mode mode,
10624 addr_space_t as ATTRIBUTE_UNUSED,
10625 bool speed)
10626 {
10627 enum rtx_code c = GET_CODE (x);
10628 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10629 struct aarch64_address_info info;
10630 int cost = 0;
10631 info.shift = 0;
10632
10633 if (!aarch64_classify_address (&info, x, mode, false))
10634 {
10635 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10636 {
10637 /* This is a CONST or SYMBOL ref which will be split
10638 in a different way depending on the code model in use.
10639 Cost it through the generic infrastructure. */
10640 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10641 /* Divide through by the cost of one instruction to
10642 bring it to the same units as the address costs. */
10643 cost_symbol_ref /= COSTS_N_INSNS (1);
10644 /* The cost is then the cost of preparing the address,
10645 followed by an immediate (possibly 0) offset. */
10646 return cost_symbol_ref + addr_cost->imm_offset;
10647 }
10648 else
10649 {
10650 /* This is most likely a jump table from a case
10651 statement. */
10652 return addr_cost->register_offset;
10653 }
10654 }
10655
10656 switch (info.type)
10657 {
10658 case ADDRESS_LO_SUM:
10659 case ADDRESS_SYMBOLIC:
10660 case ADDRESS_REG_IMM:
10661 cost += addr_cost->imm_offset;
10662 break;
10663
10664 case ADDRESS_REG_WB:
10665 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10666 cost += addr_cost->pre_modify;
10667 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10668 cost += addr_cost->post_modify;
10669 else
10670 gcc_unreachable ();
10671
10672 break;
10673
10674 case ADDRESS_REG_REG:
10675 cost += addr_cost->register_offset;
10676 break;
10677
10678 case ADDRESS_REG_SXTW:
10679 cost += addr_cost->register_sextend;
10680 break;
10681
10682 case ADDRESS_REG_UXTW:
10683 cost += addr_cost->register_zextend;
10684 break;
10685
10686 default:
10687 gcc_unreachable ();
10688 }
10689
10690
10691 if (info.shift > 0)
10692 {
10693 /* For the sake of calculating the cost of the shifted register
10694 component, we can treat same sized modes in the same way. */
10695 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10696 cost += addr_cost->addr_scale_costs.hi;
10697 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10698 cost += addr_cost->addr_scale_costs.si;
10699 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10700 cost += addr_cost->addr_scale_costs.di;
10701 else
10702 /* We can't tell, or this is a 128-bit vector. */
10703 cost += addr_cost->addr_scale_costs.ti;
10704 }
10705
10706 return cost;
10707 }
10708
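/* Editor's note, two illustrative data points for the routine above
   (assumed examples): for an SImode access through
     (plus:DI (reg:DI x1) (const_int 16))
   aarch64_classify_address reports ADDRESS_REG_IMM, so the cost is just
   addr_cost->imm_offset; for
     (plus:DI (reg:DI x1) (mult:DI (reg:DI x2) (const_int 4)))
   it reports ADDRESS_REG_REG with a shift of 2, so the cost is
   addr_cost->register_offset plus addr_cost->addr_scale_costs.si.  */
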
10709 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10710 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10711 to be taken. */
10712
10713 int
10714 aarch64_branch_cost (bool speed_p, bool predictable_p)
10715 {
10716 /* When optimizing for speed, use the cost of unpredictable branches. */
10717 const struct cpu_branch_cost *branch_costs =
10718 aarch64_tune_params.branch_costs;
10719
10720 if (!speed_p || predictable_p)
10721 return branch_costs->predictable;
10722 else
10723 return branch_costs->unpredictable;
10724 }
10725
10726 /* Return true if the RTX X in mode MODE is a zero or sign extract
10727 usable in an ADD or SUB (extended register) instruction. */
10728 static bool
10729 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10730 {
10731 /* Catch add with a sign extract.
10732 This is add_<optab><mode>_multp2. */
10733 if (GET_CODE (x) == SIGN_EXTRACT
10734 || GET_CODE (x) == ZERO_EXTRACT)
10735 {
10736 rtx op0 = XEXP (x, 0);
10737 rtx op1 = XEXP (x, 1);
10738 rtx op2 = XEXP (x, 2);
10739
10740 if (GET_CODE (op0) == MULT
10741 && CONST_INT_P (op1)
10742 && op2 == const0_rtx
10743 && CONST_INT_P (XEXP (op0, 1))
10744 && aarch64_is_extend_from_extract (mode,
10745 XEXP (op0, 1),
10746 op1))
10747 {
10748 return true;
10749 }
10750 }
10751 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10752 No shift. */
10753 else if (GET_CODE (x) == SIGN_EXTEND
10754 || GET_CODE (x) == ZERO_EXTEND)
10755 return REG_P (XEXP (x, 0));
10756
10757 return false;
10758 }
10759
10760 static bool
10761 aarch64_frint_unspec_p (unsigned int u)
10762 {
10763 switch (u)
10764 {
10765 case UNSPEC_FRINTZ:
10766 case UNSPEC_FRINTP:
10767 case UNSPEC_FRINTM:
10768 case UNSPEC_FRINTA:
10769 case UNSPEC_FRINTN:
10770 case UNSPEC_FRINTX:
10771 case UNSPEC_FRINTI:
10772 return true;
10773
10774 default:
10775 return false;
10776 }
10777 }
10778
10779 /* Return true iff X is an rtx that will match an extr instruction
10780 i.e. as described in the *extr<mode>5_insn family of patterns.
10781 OP0 and OP1 will be set to the operands of the shifts involved
10782 on success and will be NULL_RTX otherwise. */
10783
10784 static bool
10785 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10786 {
10787 rtx op0, op1;
10788 scalar_int_mode mode;
10789 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10790 return false;
10791
10792 *res_op0 = NULL_RTX;
10793 *res_op1 = NULL_RTX;
10794
10795 if (GET_CODE (x) != IOR)
10796 return false;
10797
10798 op0 = XEXP (x, 0);
10799 op1 = XEXP (x, 1);
10800
10801 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10802 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10803 {
10804 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10805 if (GET_CODE (op1) == ASHIFT)
10806 std::swap (op0, op1);
10807
10808 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10809 return false;
10810
10811 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10812 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10813
10814 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10815 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10816 {
10817 *res_op0 = XEXP (op0, 0);
10818 *res_op1 = XEXP (op1, 0);
10819 return true;
10820 }
10821 }
10822
10823 return false;
10824 }
10825
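/* Editor's note, an assumed example of the EXTR form recognised above:
   in DImode,
     (ior:DI (ashift:DI (reg:DI x0) (const_int 48))
             (lshiftrt:DI (reg:DI x1) (const_int 16)))
   has shift amounts 48 + 16 == 64 == GET_MODE_BITSIZE (DImode), so it
   matches an EXTR with immediate #16 and *RES_OP0/*RES_OP1 are set to
   the two shifted registers.  */
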
10826 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10827 storing it in *COST. Result is true if the total cost of the operation
10828 has now been calculated. */
10829 static bool
10830 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10831 {
10832 rtx inner;
10833 rtx comparator;
10834 enum rtx_code cmpcode;
10835
10836 if (COMPARISON_P (op0))
10837 {
10838 inner = XEXP (op0, 0);
10839 comparator = XEXP (op0, 1);
10840 cmpcode = GET_CODE (op0);
10841 }
10842 else
10843 {
10844 inner = op0;
10845 comparator = const0_rtx;
10846 cmpcode = NE;
10847 }
10848
10849 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10850 {
10851 /* Conditional branch. */
10852 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10853 return true;
10854 else
10855 {
10856 if (cmpcode == NE || cmpcode == EQ)
10857 {
10858 if (comparator == const0_rtx)
10859 {
10860 /* TBZ/TBNZ/CBZ/CBNZ. */
10861 if (GET_CODE (inner) == ZERO_EXTRACT)
10862 /* TBZ/TBNZ. */
10863 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10864 ZERO_EXTRACT, 0, speed);
10865 else
10866 /* CBZ/CBNZ. */
10867 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10868
10869 return true;
10870 }
10871 }
10872 else if (cmpcode == LT || cmpcode == GE)
10873 {
10874 /* TBZ/TBNZ. */
10875 if (comparator == const0_rtx)
10876 return true;
10877 }
10878 }
10879 }
10880 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10881 {
10882 /* CCMP. */
10883 if (GET_CODE (op1) == COMPARE)
10884 {
10885 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10886 if (XEXP (op1, 1) == const0_rtx)
10887 *cost += 1;
10888 if (speed)
10889 {
10890 machine_mode mode = GET_MODE (XEXP (op1, 0));
10891 const struct cpu_cost_table *extra_cost
10892 = aarch64_tune_params.insn_extra_cost;
10893
10894 if (GET_MODE_CLASS (mode) == MODE_INT)
10895 *cost += extra_cost->alu.arith;
10896 else
10897 *cost += extra_cost->fp[mode == DFmode].compare;
10898 }
10899 return true;
10900 }
10901
10902 /* It's a conditional operation based on the status flags,
10903 so it must be some flavor of CSEL. */
10904
10905 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10906 if (GET_CODE (op1) == NEG
10907 || GET_CODE (op1) == NOT
10908 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10909 op1 = XEXP (op1, 0);
10910 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10911 {
10912 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10913 op1 = XEXP (op1, 0);
10914 op2 = XEXP (op2, 0);
10915 }
10916
10917 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10918 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10919 return true;
10920 }
10921
10922 /* We don't know what this is, cost all operands. */
10923 return false;
10924 }
10925
10926 /* Check whether X is a bitfield operation of the form shift + extend that
10927 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10928 operand to which the bitfield operation is applied. Otherwise return
10929 NULL_RTX. */
10930
10931 static rtx
10932 aarch64_extend_bitfield_pattern_p (rtx x)
10933 {
10934 rtx_code outer_code = GET_CODE (x);
10935 machine_mode outer_mode = GET_MODE (x);
10936
10937 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10938 && outer_mode != SImode && outer_mode != DImode)
10939 return NULL_RTX;
10940
10941 rtx inner = XEXP (x, 0);
10942 rtx_code inner_code = GET_CODE (inner);
10943 machine_mode inner_mode = GET_MODE (inner);
10944 rtx op = NULL_RTX;
10945
10946 switch (inner_code)
10947 {
10948 case ASHIFT:
10949 if (CONST_INT_P (XEXP (inner, 1))
10950 && (inner_mode == QImode || inner_mode == HImode))
10951 op = XEXP (inner, 0);
10952 break;
10953 case LSHIFTRT:
10954 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10955 && (inner_mode == QImode || inner_mode == HImode))
10956 op = XEXP (inner, 0);
10957 break;
10958 case ASHIFTRT:
10959 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10960 && (inner_mode == QImode || inner_mode == HImode))
10961 op = XEXP (inner, 0);
10962 break;
10963 default:
10964 break;
10965 }
10966
10967 return op;
10968 }
10969
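/* Editor's note, an assumed example for the helper above:
     (zero_extend:SI (lshiftrt:HI (reg:HI x0) (const_int 3)))
   is the shift-plus-extend shape that maps onto a single UBFX, so the
   helper returns (reg:HI x0) and the caller costs only that operand
   plus one alu.bfx.  */
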
10970 /* Return true if the mask and a shift amount from an RTX of the form
10971 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10972 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10973
10974 bool
10975 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10976 rtx shft_amnt)
10977 {
10978 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10979 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10980 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10981 && (INTVAL (mask)
10982 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10983 }
10984
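/* Editor's illustrative sketch (not part of GCC): the same test on
   plain 64-bit integers.  For instance, a shift of 8 with mask
   0x00ffff00 passes: (mask >> 8) + 1 == 0x10000 is a power of two
   (a contiguous 16-bit field) and the low 8 bits of the mask are
   clear, so (x << 8) & 0x00ffff00 is a UBFIZ with lsb 8 and width 16.
   Kept compiled out so that it does not alter the translation unit.  */
#if 0
static int
ubfiz_mask_sketch (unsigned int mode_bits, unsigned long long mask,
                   unsigned long long shift)
{
  /* The shift amount must fit in the mode...  */
  if (shift >= mode_bits)
    return 0;
  /* ...the surviving mask bits must form one contiguous block starting
     at bit SHIFT, so (mask >> shift) + 1 is a power of two...  */
  unsigned long long field = (mask >> shift) + 1;
  if (field == 0 || (field & (field - 1)) != 0)
    return 0;
  /* ...and no mask bit may lie below the shift amount.  */
  return (mask & ((1ULL << shift) - 1)) == 0;
}
#endif
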
10985 /* Return true if the masks and a shift amount from an RTX of the form
10986 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10987 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10988
10989 bool
10990 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10991 unsigned HOST_WIDE_INT mask1,
10992 unsigned HOST_WIDE_INT shft_amnt,
10993 unsigned HOST_WIDE_INT mask2)
10994 {
10995 unsigned HOST_WIDE_INT t;
10996
10997 /* Verify that the two masks are exact complements of each other. */
10998 if (mask1 != ~mask2)
10999 return false;
11000
11001 /* Verify that mask2 is not all zeros or ones. */
11002 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11003 return false;
11004
11005 /* The shift amount should always be less than the mode size. */
11006 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11007
11008 /* Verify that the mask being shifted is contiguous and would be in the
11009 least significant bits after shifting by shft_amnt. */
11010 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11011 return (t == (t & -t));
11012 }
11013
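/* Editor's illustrative sketch (not part of GCC): the contiguity test
   above on plain 64-bit integers.  E.g. mask1 == 0xffffffffffff00ff,
   shft_amnt == 8 and mask2 == 0xff00 pass: the masks are exact
   complements, and mask2 + (1 << 8) == 0x10000 has a single bit set,
   i.e. mask2 is one contiguous byte-wide field at bit 8, which a BFI
   can insert.  Kept compiled out so that it does not alter the
   translation unit.  */
#if 0
static int
bfi_masks_sketch (unsigned long long mask1, unsigned long long shft_amnt,
                  unsigned long long mask2)
{
  /* The two masks must partition the register between them.  */
  if (mask1 != ~mask2)
    return 0;
  /* An all-clear or all-set insertion mask is not a BFI.  */
  if (mask2 == 0 || mask2 == ~0ULL)
    return 0;
  /* mask2 must be a contiguous run of ones starting at bit SHFT_AMNT;
     adding 1 << SHFT_AMNT then leaves exactly one bit set.  */
  unsigned long long t = mask2 + (1ULL << shft_amnt);
  return t == (t & -t);
}
#endif
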
11014 /* Calculate the cost of calculating X, storing it in *COST. Result
11015 is true if the total cost of the operation has now been calculated. */
11016 static bool
11017 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11018 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11019 {
11020 rtx op0, op1, op2;
11021 const struct cpu_cost_table *extra_cost
11022 = aarch64_tune_params.insn_extra_cost;
11023 int code = GET_CODE (x);
11024 scalar_int_mode int_mode;
11025
11026 /* By default, assume that everything has equivalent cost to the
11027 cheapest instruction. Any additional costs are applied as a delta
11028 above this default. */
11029 *cost = COSTS_N_INSNS (1);
11030
11031 switch (code)
11032 {
11033 case SET:
11034 /* The cost depends entirely on the operands to SET. */
11035 *cost = 0;
11036 op0 = SET_DEST (x);
11037 op1 = SET_SRC (x);
11038
11039 switch (GET_CODE (op0))
11040 {
11041 case MEM:
11042 if (speed)
11043 {
11044 rtx address = XEXP (op0, 0);
11045 if (VECTOR_MODE_P (mode))
11046 *cost += extra_cost->ldst.storev;
11047 else if (GET_MODE_CLASS (mode) == MODE_INT)
11048 *cost += extra_cost->ldst.store;
11049 else if (mode == SFmode)
11050 *cost += extra_cost->ldst.storef;
11051 else if (mode == DFmode)
11052 *cost += extra_cost->ldst.stored;
11053
11054 *cost +=
11055 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11056 0, speed));
11057 }
11058
11059 *cost += rtx_cost (op1, mode, SET, 1, speed);
11060 return true;
11061
11062 case SUBREG:
11063 if (! REG_P (SUBREG_REG (op0)))
11064 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11065
11066 /* Fall through. */
11067 case REG:
11068 /* The cost is one per vector-register copied. */
11069 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11070 {
11071 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11072 *cost = COSTS_N_INSNS (nregs);
11073 }
11074 /* const0_rtx is in general free, but we will use an
11075 instruction to set a register to 0. */
11076 else if (REG_P (op1) || op1 == const0_rtx)
11077 {
11078 /* The cost is 1 per register copied. */
11079 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11080 *cost = COSTS_N_INSNS (nregs);
11081 }
11082 else
11083 /* Cost is just the cost of the RHS of the set. */
11084 *cost += rtx_cost (op1, mode, SET, 1, speed);
11085 return true;
11086
11087 case ZERO_EXTRACT:
11088 case SIGN_EXTRACT:
11089 /* Bit-field insertion. Strip any redundant widening of
11090 the RHS to meet the width of the target. */
11091 if (GET_CODE (op1) == SUBREG)
11092 op1 = SUBREG_REG (op1);
11093 if ((GET_CODE (op1) == ZERO_EXTEND
11094 || GET_CODE (op1) == SIGN_EXTEND)
11095 && CONST_INT_P (XEXP (op0, 1))
11096 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11097 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11098 op1 = XEXP (op1, 0);
11099
11100 if (CONST_INT_P (op1))
11101 {
11102 /* MOV immediate is assumed to always be cheap. */
11103 *cost = COSTS_N_INSNS (1);
11104 }
11105 else
11106 {
11107 /* BFM. */
11108 if (speed)
11109 *cost += extra_cost->alu.bfi;
11110 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11111 }
11112
11113 return true;
11114
11115 default:
11116 /* We can't make sense of this, assume default cost. */
11117 *cost = COSTS_N_INSNS (1);
11118 return false;
11119 }
11120 return false;
11121
11122 case CONST_INT:
11123 /* If an instruction can incorporate a constant within the
11124 instruction, the instruction's expression avoids calling
11125 rtx_cost() on the constant. If rtx_cost() is called on a
11126 constant, then it is usually because the constant must be
11127 moved into a register by one or more instructions.
11128
11129 The exception is constant 0, which can be expressed
11130 as XZR/WZR and is therefore free. The exception to this is
11131 if we have (set (reg) (const0_rtx)) in which case we must cost
11132 the move. However, we can catch that when we cost the SET, so
11133 we don't need to consider that here. */
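/* Editor's note (assumed examples): 0x1234 is built with a single
   MOVZ and therefore costs COSTS_N_INSNS (1), while 0x12345678 needs
   a MOVZ/MOVK pair and costs COSTS_N_INSNS (2); the exact count comes
   from aarch64_internal_mov_immediate below.  */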
11134 if (x == const0_rtx)
11135 *cost = 0;
11136 else
11137 {
11138 /* To an approximation, the cost of building any other constant
11139 is proportional to the number of instructions required to
11140 build it. This is true whether we are compiling for SPEED
11141 or otherwise. */
11142 if (!is_a <scalar_int_mode> (mode, &int_mode))
11143 int_mode = word_mode;
11144 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11145 (NULL_RTX, x, false, int_mode));
11146 }
11147 return true;
11148
11149 case CONST_DOUBLE:
11150
11151 /* First determine the number of instructions needed to do the
11152 move as an integer constant. */
11153 if (!aarch64_float_const_representable_p (x)
11154 && !aarch64_can_const_movi_rtx_p (x, mode)
11155 && aarch64_float_const_rtx_p (x))
11156 {
11157 unsigned HOST_WIDE_INT ival;
11158 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11159 gcc_assert (succeed);
11160
11161 scalar_int_mode imode = (mode == HFmode
11162 ? SImode
11163 : int_mode_for_mode (mode).require ());
11164 int ncost = aarch64_internal_mov_immediate
11165 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11166 *cost += COSTS_N_INSNS (ncost);
11167 return true;
11168 }
11169
11170 if (speed)
11171 {
11172 /* mov[df,sf]_aarch64. */
11173 if (aarch64_float_const_representable_p (x))
11174 /* FMOV (scalar immediate). */
11175 *cost += extra_cost->fp[mode == DFmode].fpconst;
11176 else if (!aarch64_float_const_zero_rtx_p (x))
11177 {
11178 /* This will be a load from memory. */
11179 if (mode == DFmode)
11180 *cost += extra_cost->ldst.loadd;
11181 else
11182 *cost += extra_cost->ldst.loadf;
11183 }
11184 else
11185 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11186 or MOV v0.s[0], wzr, neither of which is modeled by the
11187 cost tables. Just use the default cost. */
11188 {
11189 }
11190 }
11191
11192 return true;
11193
11194 case MEM:
11195 if (speed)
11196 {
11197 /* For loads we want the base cost of a load, plus an
11198 approximation for the additional cost of the addressing
11199 mode. */
11200 rtx address = XEXP (x, 0);
11201 if (VECTOR_MODE_P (mode))
11202 *cost += extra_cost->ldst.loadv;
11203 else if (GET_MODE_CLASS (mode) == MODE_INT)
11204 *cost += extra_cost->ldst.load;
11205 else if (mode == SFmode)
11206 *cost += extra_cost->ldst.loadf;
11207 else if (mode == DFmode)
11208 *cost += extra_cost->ldst.loadd;
11209
11210 *cost +=
11211 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11212 0, speed));
11213 }
11214
11215 return true;
11216
11217 case NEG:
11218 op0 = XEXP (x, 0);
11219
11220 if (VECTOR_MODE_P (mode))
11221 {
11222 if (speed)
11223 {
11224 /* FNEG. */
11225 *cost += extra_cost->vect.alu;
11226 }
11227 return false;
11228 }
11229
11230 if (GET_MODE_CLASS (mode) == MODE_INT)
11231 {
11232 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11233 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11234 {
11235 /* CSETM. */
11236 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11237 return true;
11238 }
11239
11240 /* Cost this as SUB wzr, X. */
11241 op0 = CONST0_RTX (mode);
11242 op1 = XEXP (x, 0);
11243 goto cost_minus;
11244 }
11245
11246 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11247 {
11248 /* Support (neg(fma...)) as a single instruction only if
11249 sign of zeros is unimportant. This matches the decision
11250 making in aarch64.md. */
11251 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11252 {
11253 /* FNMADD. */
11254 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11255 return true;
11256 }
11257 if (GET_CODE (op0) == MULT)
11258 {
11259 /* FNMUL. */
11260 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11261 return true;
11262 }
11263 if (speed)
11264 /* FNEG. */
11265 *cost += extra_cost->fp[mode == DFmode].neg;
11266 return false;
11267 }
11268
11269 return false;
11270
11271 case CLRSB:
11272 case CLZ:
11273 if (speed)
11274 {
11275 if (VECTOR_MODE_P (mode))
11276 *cost += extra_cost->vect.alu;
11277 else
11278 *cost += extra_cost->alu.clz;
11279 }
11280
11281 return false;
11282
11283 case COMPARE:
11284 op0 = XEXP (x, 0);
11285 op1 = XEXP (x, 1);
11286
11287 if (op1 == const0_rtx
11288 && GET_CODE (op0) == AND)
11289 {
11290 x = op0;
11291 mode = GET_MODE (op0);
11292 goto cost_logic;
11293 }
11294
11295 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11296 {
11297 /* TODO: A write to the CC flags possibly costs extra, this
11298 needs encoding in the cost tables. */
11299
11300 mode = GET_MODE (op0);
11301 /* ANDS. */
11302 if (GET_CODE (op0) == AND)
11303 {
11304 x = op0;
11305 goto cost_logic;
11306 }
11307
11308 if (GET_CODE (op0) == PLUS)
11309 {
11310 /* ADDS (and CMN alias). */
11311 x = op0;
11312 goto cost_plus;
11313 }
11314
11315 if (GET_CODE (op0) == MINUS)
11316 {
11317 /* SUBS. */
11318 x = op0;
11319 goto cost_minus;
11320 }
11321
11322 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11323 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11324 && CONST_INT_P (XEXP (op0, 2)))
11325 {
11326 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11327 Handle it here directly rather than going to cost_logic
11328 since we know the immediate generated for the TST is valid
11329 so we can avoid creating an intermediate rtx for it only
11330 for costing purposes. */
11331 if (speed)
11332 *cost += extra_cost->alu.logical;
11333
11334 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11335 ZERO_EXTRACT, 0, speed);
11336 return true;
11337 }
11338
11339 if (GET_CODE (op1) == NEG)
11340 {
11341 /* CMN. */
11342 if (speed)
11343 *cost += extra_cost->alu.arith;
11344
11345 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11346 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11347 return true;
11348 }
11349
11350 /* CMP.
11351
11352 Compare can freely swap the order of operands, and
11353 canonicalization puts the more complex operation first.
11354 But the integer MINUS logic expects the shift/extend
11355 operation in op1. */
11356 if (! (REG_P (op0)
11357 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11358 {
11359 op0 = XEXP (x, 1);
11360 op1 = XEXP (x, 0);
11361 }
11362 goto cost_minus;
11363 }
11364
11365 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11366 {
11367 /* FCMP. */
11368 if (speed)
11369 *cost += extra_cost->fp[mode == DFmode].compare;
11370
11371 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11372 {
11373 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11374 /* FCMP supports constant 0.0 for no extra cost. */
11375 return true;
11376 }
11377 return false;
11378 }
11379
11380 if (VECTOR_MODE_P (mode))
11381 {
11382 /* Vector compare. */
11383 if (speed)
11384 *cost += extra_cost->vect.alu;
11385
11386 if (aarch64_float_const_zero_rtx_p (op1))
11387 {
11388 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11389 cost. */
11390 return true;
11391 }
11392 return false;
11393 }
11394 return false;
11395
11396 case MINUS:
11397 {
11398 op0 = XEXP (x, 0);
11399 op1 = XEXP (x, 1);
11400
11401 cost_minus:
11402 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11403
11404 /* Detect valid immediates. */
11405 if ((GET_MODE_CLASS (mode) == MODE_INT
11406 || (GET_MODE_CLASS (mode) == MODE_CC
11407 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11408 && CONST_INT_P (op1)
11409 && aarch64_uimm12_shift (INTVAL (op1)))
11410 {
11411 if (speed)
11412 /* SUB(S) (immediate). */
11413 *cost += extra_cost->alu.arith;
11414 return true;
11415 }
11416
11417 /* Look for SUB (extended register). */
11418 if (is_a <scalar_int_mode> (mode, &int_mode)
11419 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11420 {
11421 if (speed)
11422 *cost += extra_cost->alu.extend_arith;
11423
11424 op1 = aarch64_strip_extend (op1, true);
11425 *cost += rtx_cost (op1, VOIDmode,
11426 (enum rtx_code) GET_CODE (op1), 0, speed);
11427 return true;
11428 }
11429
11430 rtx new_op1 = aarch64_strip_extend (op1, false);
11431
11432 /* Cost this as an FMA-alike operation. */
11433 if ((GET_CODE (new_op1) == MULT
11434 || aarch64_shift_p (GET_CODE (new_op1)))
11435 && code != COMPARE)
11436 {
11437 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11438 (enum rtx_code) code,
11439 speed);
11440 return true;
11441 }
11442
11443 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11444
11445 if (speed)
11446 {
11447 if (VECTOR_MODE_P (mode))
11448 {
11449 /* Vector SUB. */
11450 *cost += extra_cost->vect.alu;
11451 }
11452 else if (GET_MODE_CLASS (mode) == MODE_INT)
11453 {
11454 /* SUB(S). */
11455 *cost += extra_cost->alu.arith;
11456 }
11457 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11458 {
11459 /* FSUB. */
11460 *cost += extra_cost->fp[mode == DFmode].addsub;
11461 }
11462 }
11463 return true;
11464 }
11465
11466 case PLUS:
11467 {
11468 rtx new_op0;
11469
11470 op0 = XEXP (x, 0);
11471 op1 = XEXP (x, 1);
11472
11473 cost_plus:
11474 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11475 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11476 {
11477 /* CSINC. */
11478 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11479 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11480 return true;
11481 }
11482
11483 if (GET_MODE_CLASS (mode) == MODE_INT
11484 && (aarch64_plus_immediate (op1, mode)
11485 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11486 {
11487 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11488
11489 if (speed)
11490 /* ADD (immediate). */
11491 *cost += extra_cost->alu.arith;
11492 return true;
11493 }
11494
11495 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11496
11497 /* Look for ADD (extended register). */
11498 if (is_a <scalar_int_mode> (mode, &int_mode)
11499 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11500 {
11501 if (speed)
11502 *cost += extra_cost->alu.extend_arith;
11503
11504 op0 = aarch64_strip_extend (op0, true);
11505 *cost += rtx_cost (op0, VOIDmode,
11506 (enum rtx_code) GET_CODE (op0), 0, speed);
11507 return true;
11508 }
11509
11510 /* Strip any extend, leave shifts behind as we will
11511 cost them through mult_cost. */
11512 new_op0 = aarch64_strip_extend (op0, false);
11513
11514 if (GET_CODE (new_op0) == MULT
11515 || aarch64_shift_p (GET_CODE (new_op0)))
11516 {
11517 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11518 speed);
11519 return true;
11520 }
11521
11522 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11523
11524 if (speed)
11525 {
11526 if (VECTOR_MODE_P (mode))
11527 {
11528 /* Vector ADD. */
11529 *cost += extra_cost->vect.alu;
11530 }
11531 else if (GET_MODE_CLASS (mode) == MODE_INT)
11532 {
11533 /* ADD. */
11534 *cost += extra_cost->alu.arith;
11535 }
11536 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11537 {
11538 /* FADD. */
11539 *cost += extra_cost->fp[mode == DFmode].addsub;
11540 }
11541 }
11542 return true;
11543 }
11544
11545 case BSWAP:
11546 *cost = COSTS_N_INSNS (1);
11547
11548 if (speed)
11549 {
11550 if (VECTOR_MODE_P (mode))
11551 *cost += extra_cost->vect.alu;
11552 else
11553 *cost += extra_cost->alu.rev;
11554 }
11555 return false;
11556
11557 case IOR:
11558 if (aarch_rev16_p (x))
11559 {
11560 *cost = COSTS_N_INSNS (1);
11561
11562 if (speed)
11563 {
11564 if (VECTOR_MODE_P (mode))
11565 *cost += extra_cost->vect.alu;
11566 else
11567 *cost += extra_cost->alu.rev;
11568 }
11569 return true;
11570 }
11571
11572 if (aarch64_extr_rtx_p (x, &op0, &op1))
11573 {
11574 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11575 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11576 if (speed)
11577 *cost += extra_cost->alu.shift;
11578
11579 return true;
11580 }
11581 /* Fall through. */
11582 case XOR:
11583 case AND:
11584 cost_logic:
11585 op0 = XEXP (x, 0);
11586 op1 = XEXP (x, 1);
11587
11588 if (VECTOR_MODE_P (mode))
11589 {
11590 if (speed)
11591 *cost += extra_cost->vect.alu;
11592 return true;
11593 }
11594
11595 if (code == AND
11596 && GET_CODE (op0) == MULT
11597 && CONST_INT_P (XEXP (op0, 1))
11598 && CONST_INT_P (op1)
11599 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11600 INTVAL (op1)) != 0)
11601 {
11602 /* This is a UBFM/SBFM. */
11603 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11604 if (speed)
11605 *cost += extra_cost->alu.bfx;
11606 return true;
11607 }
11608
11609 if (is_int_mode (mode, &int_mode))
11610 {
11611 if (CONST_INT_P (op1))
11612 {
11613 /* We have a mask + shift version of a UBFIZ
11614 i.e. the *andim_ashift<mode>_bfiz pattern. */
11615 if (GET_CODE (op0) == ASHIFT
11616 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11617 XEXP (op0, 1)))
11618 {
11619 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11620 (enum rtx_code) code, 0, speed);
11621 if (speed)
11622 *cost += extra_cost->alu.bfx;
11623
11624 return true;
11625 }
11626 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11627 {
11628 /* We possibly get the immediate for free, this is not
11629 modelled. */
11630 *cost += rtx_cost (op0, int_mode,
11631 (enum rtx_code) code, 0, speed);
11632 if (speed)
11633 *cost += extra_cost->alu.logical;
11634
11635 return true;
11636 }
11637 }
11638 else
11639 {
11640 rtx new_op0 = op0;
11641
11642 /* Handle ORN, EON, or BIC. */
11643 if (GET_CODE (op0) == NOT)
11644 op0 = XEXP (op0, 0);
11645
11646 new_op0 = aarch64_strip_shift (op0);
11647
11648 /* If we had a shift on op0 then this is a logical-shift-
11649 by-register/immediate operation. Otherwise, this is just
11650 a logical operation. */
11651 if (speed)
11652 {
11653 if (new_op0 != op0)
11654 {
11655 /* Shift by immediate. */
11656 if (CONST_INT_P (XEXP (op0, 1)))
11657 *cost += extra_cost->alu.log_shift;
11658 else
11659 *cost += extra_cost->alu.log_shift_reg;
11660 }
11661 else
11662 *cost += extra_cost->alu.logical;
11663 }
11664
11665 /* In both cases we want to cost both operands. */
11666 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11667 0, speed);
11668 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11669 1, speed);
11670
11671 return true;
11672 }
11673 }
11674 return false;
11675
11676 case NOT:
11677 x = XEXP (x, 0);
11678 op0 = aarch64_strip_shift (x);
11679
11680 if (VECTOR_MODE_P (mode))
11681 {
11682 /* Vector NOT. */
11683 *cost += extra_cost->vect.alu;
11684 return false;
11685 }
11686
11687 /* MVN-shifted-reg. */
11688 if (op0 != x)
11689 {
11690 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11691
11692 if (speed)
11693 *cost += extra_cost->alu.log_shift;
11694
11695 return true;
11696 }
11697 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
11698 Handle the second form here, taking care that 'a' in the above
11699 can be a shift. */
11700 else if (GET_CODE (op0) == XOR)
11701 {
11702 rtx newop0 = XEXP (op0, 0);
11703 rtx newop1 = XEXP (op0, 1);
11704 rtx op0_stripped = aarch64_strip_shift (newop0);
11705
11706 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11707 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11708
11709 if (speed)
11710 {
11711 if (op0_stripped != newop0)
11712 *cost += extra_cost->alu.log_shift;
11713 else
11714 *cost += extra_cost->alu.logical;
11715 }
11716
11717 return true;
11718 }
11719 /* MVN. */
11720 if (speed)
11721 *cost += extra_cost->alu.logical;
11722
11723 return false;
11724
11725 case ZERO_EXTEND:
11726
11727 op0 = XEXP (x, 0);
11728 /* If a value is written in SI mode, then zero extended to DI
11729 mode, the operation will in general be free as a write to
11730 a 'w' register implicitly zeroes the upper bits of an 'x'
11731 register. However, if this is
11732
11733 (set (reg) (zero_extend (reg)))
11734
11735 we must cost the explicit register move. */
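/* Editor's note (assumed example): (zero_extend:DI (plus:SI ...)) is
   free here because the 32-bit ADD that writes the 'w' register
   already zeroes bits 63:32; only a bare (zero_extend:DI (reg:SI ...))
   inside a SET has to pay for the MOV.  */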
11736 if (mode == DImode
11737 && GET_MODE (op0) == SImode
11738 && outer == SET)
11739 {
11740 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11741
11742 /* If OP_COST is non-zero, then the cost of the zero extend
11743 is effectively the cost of the inner operation. Otherwise
11744 we have a MOV instruction and we take the cost from the MOV
11745 itself. This is true independently of whether we are
11746 optimizing for space or time. */
11747 if (op_cost)
11748 *cost = op_cost;
11749
11750 return true;
11751 }
11752 else if (MEM_P (op0))
11753 {
11754 /* All loads can zero extend to any size for free. */
11755 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11756 return true;
11757 }
11758
11759 op0 = aarch64_extend_bitfield_pattern_p (x);
11760 if (op0)
11761 {
11762 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11763 if (speed)
11764 *cost += extra_cost->alu.bfx;
11765 return true;
11766 }
11767
11768 if (speed)
11769 {
11770 if (VECTOR_MODE_P (mode))
11771 {
11772 /* UMOV. */
11773 *cost += extra_cost->vect.alu;
11774 }
11775 else
11776 {
11777 /* We generate an AND instead of UXTB/UXTH. */
11778 *cost += extra_cost->alu.logical;
11779 }
11780 }
11781 return false;
11782
11783 case SIGN_EXTEND:
11784 if (MEM_P (XEXP (x, 0)))
11785 {
11786 /* LDRSH. */
11787 if (speed)
11788 {
11789 rtx address = XEXP (XEXP (x, 0), 0);
11790 *cost += extra_cost->ldst.load_sign_extend;
11791
11792 *cost +=
11793 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11794 0, speed));
11795 }
11796 return true;
11797 }
11798
11799 op0 = aarch64_extend_bitfield_pattern_p (x);
11800 if (op0)
11801 {
11802 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11803 if (speed)
11804 *cost += extra_cost->alu.bfx;
11805 return true;
11806 }
11807
11808 if (speed)
11809 {
11810 if (VECTOR_MODE_P (mode))
11811 *cost += extra_cost->vect.alu;
11812 else
11813 *cost += extra_cost->alu.extend;
11814 }
11815 return false;
11816
11817 case ASHIFT:
11818 op0 = XEXP (x, 0);
11819 op1 = XEXP (x, 1);
11820
11821 if (CONST_INT_P (op1))
11822 {
11823 if (speed)
11824 {
11825 if (VECTOR_MODE_P (mode))
11826 {
11827 /* Vector shift (immediate). */
11828 *cost += extra_cost->vect.alu;
11829 }
11830 else
11831 {
11832 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11833 aliases. */
11834 *cost += extra_cost->alu.shift;
11835 }
11836 }
11837
11838 /* We can incorporate zero/sign extend for free. */
11839 if (GET_CODE (op0) == ZERO_EXTEND
11840 || GET_CODE (op0) == SIGN_EXTEND)
11841 op0 = XEXP (op0, 0);
11842
11843 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11844 return true;
11845 }
11846 else
11847 {
11848 if (VECTOR_MODE_P (mode))
11849 {
11850 if (speed)
11851 /* Vector shift (register). */
11852 *cost += extra_cost->vect.alu;
11853 }
11854 else
11855 {
11856 if (speed)
11857 /* LSLV. */
11858 *cost += extra_cost->alu.shift_reg;
11859
11860 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11861 && CONST_INT_P (XEXP (op1, 1))
11862 && known_eq (INTVAL (XEXP (op1, 1)),
11863 GET_MODE_BITSIZE (mode) - 1))
11864 {
11865 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11866 /* We already required XEXP (op1, 0) to be REG_P, so
11867 don't recurse into it. */
11868 return true;
11869 }
11870 }
11871 return false; /* All arguments need to be in registers. */
11872 }
11873
11874 case ROTATE:
11875 case ROTATERT:
11876 case LSHIFTRT:
11877 case ASHIFTRT:
11878 op0 = XEXP (x, 0);
11879 op1 = XEXP (x, 1);
11880
11881 if (CONST_INT_P (op1))
11882 {
11883 /* ASR (immediate) and friends. */
11884 if (speed)
11885 {
11886 if (VECTOR_MODE_P (mode))
11887 *cost += extra_cost->vect.alu;
11888 else
11889 *cost += extra_cost->alu.shift;
11890 }
11891
11892 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11893 return true;
11894 }
11895 else
11896 {
11897 if (VECTOR_MODE_P (mode))
11898 {
11899 if (speed)
11900 /* Vector shift (register). */
11901 *cost += extra_cost->vect.alu;
11902 }
11903 else
11904 {
11905 if (speed)
11906 /* ASR (register) and friends. */
11907 *cost += extra_cost->alu.shift_reg;
11908
11909 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11910 && CONST_INT_P (XEXP (op1, 1))
11911 && known_eq (INTVAL (XEXP (op1, 1)),
11912 GET_MODE_BITSIZE (mode) - 1))
11913 {
11914 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11915 /* We already required XEXP (op1, 0) to be REG_P, so
11916 don't recurse into it. */
11917 return true;
11918 }
11919 }
11920 return false; /* All arguments need to be in registers. */
11921 }
11922
11923 case SYMBOL_REF:
11924
11925 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11926 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11927 {
11928 /* LDR. */
11929 if (speed)
11930 *cost += extra_cost->ldst.load;
11931 }
11932 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11933 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11934 {
11935 /* ADRP, followed by ADD. */
11936 *cost += COSTS_N_INSNS (1);
11937 if (speed)
11938 *cost += 2 * extra_cost->alu.arith;
11939 }
11940 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11941 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11942 {
11943 /* ADR. */
11944 if (speed)
11945 *cost += extra_cost->alu.arith;
11946 }
11947
11948 if (flag_pic)
11949 {
11950 /* One extra load instruction, after accessing the GOT. */
11951 *cost += COSTS_N_INSNS (1);
11952 if (speed)
11953 *cost += extra_cost->ldst.load;
11954 }
11955 return true;
11956
11957 case HIGH:
11958 case LO_SUM:
11959 /* ADRP/ADD (immediate). */
11960 if (speed)
11961 *cost += extra_cost->alu.arith;
11962 return true;
11963
11964 case ZERO_EXTRACT:
11965 case SIGN_EXTRACT:
11966 /* UBFX/SBFX. */
11967 if (speed)
11968 {
11969 if (VECTOR_MODE_P (mode))
11970 *cost += extra_cost->vect.alu;
11971 else
11972 *cost += extra_cost->alu.bfx;
11973 }
11974
11975 /* We can trust that the immediates used will be correct (there
11976 are no by-register forms), so we need only cost op0. */
11977 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11978 return true;
11979
11980 case MULT:
11981 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11982 /* aarch64_rtx_mult_cost always handles recursion to its
11983 operands. */
11984 return true;
11985
11986 case MOD:
11987 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11988 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the
11989 same as that of an unconditional negate. This case should only
11990 ever be reached through the set_smod_pow2_cheap check in expmed.c. */
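/* Editor's illustration (assumed; the precise sequence is chosen by
   the expander in expmed.c): for SImode "x % 16" the expansion
   computes, roughly,
     t = -x                 NEGS, also sets the flags
     a = x & 15             AND
     b = t & 15             AND, independent of the previous one
     r = x < 0 ? -b : a     CSNEG
   which is why the baseline below is reset to COSTS_N_INSNS (4).  */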
11991 if (CONST_INT_P (XEXP (x, 1))
11992 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11993 && (mode == SImode || mode == DImode))
11994 {
11995 /* We expand to 4 instructions. Reset the baseline. */
11996 *cost = COSTS_N_INSNS (4);
11997
11998 if (speed)
11999 *cost += 2 * extra_cost->alu.logical
12000 + 2 * extra_cost->alu.arith;
12001
12002 return true;
12003 }
12004
12005 /* Fall-through. */
12006 case UMOD:
12007 if (speed)
12008 {
12009 /* Slightly prefer UMOD over SMOD. */
12010 if (VECTOR_MODE_P (mode))
12011 *cost += extra_cost->vect.alu;
12012 else if (GET_MODE_CLASS (mode) == MODE_INT)
12013 *cost += (extra_cost->mult[mode == DImode].add
12014 + extra_cost->mult[mode == DImode].idiv
12015 + (code == MOD ? 1 : 0));
12016 }
12017 return false; /* All arguments need to be in registers. */
12018
12019 case DIV:
12020 case UDIV:
12021 case SQRT:
12022 if (speed)
12023 {
12024 if (VECTOR_MODE_P (mode))
12025 *cost += extra_cost->vect.alu;
12026 else if (GET_MODE_CLASS (mode) == MODE_INT)
12027 /* There is no integer SQRT, so only DIV and UDIV can get
12028 here. */
12029 *cost += (extra_cost->mult[mode == DImode].idiv
12030 /* Slightly prefer UDIV over SDIV. */
12031 + (code == DIV ? 1 : 0));
12032 else
12033 *cost += extra_cost->fp[mode == DFmode].div;
12034 }
12035 return false; /* All arguments need to be in registers. */
12036
12037 case IF_THEN_ELSE:
12038 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12039 XEXP (x, 2), cost, speed);
12040
12041 case EQ:
12042 case NE:
12043 case GT:
12044 case GTU:
12045 case LT:
12046 case LTU:
12047 case GE:
12048 case GEU:
12049 case LE:
12050 case LEU:
12051
12052 return false; /* All arguments must be in registers. */
12053
12054 case FMA:
12055 op0 = XEXP (x, 0);
12056 op1 = XEXP (x, 1);
12057 op2 = XEXP (x, 2);
12058
12059 if (speed)
12060 {
12061 if (VECTOR_MODE_P (mode))
12062 *cost += extra_cost->vect.alu;
12063 else
12064 *cost += extra_cost->fp[mode == DFmode].fma;
12065 }
12066
12067 /* FMSUB, FNMADD, and FNMSUB are free. */
12068 if (GET_CODE (op0) == NEG)
12069 op0 = XEXP (op0, 0);
12070
12071 if (GET_CODE (op2) == NEG)
12072 op2 = XEXP (op2, 0);
12073
12074 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12075 and the by-element operand as operand 0. */
12076 if (GET_CODE (op1) == NEG)
12077 op1 = XEXP (op1, 0);
12078
12079 /* Catch vector-by-element operations. The by-element operand can
12080 either be (vec_duplicate (vec_select (x))) or just
12081 (vec_select (x)), depending on whether we are multiplying by
12082 a vector or a scalar.
12083
12084 Canonicalization is not very good in these cases: FMA4 puts the
12085 by-element operand as operand 0, while FNMA4 has it as operand 1. */
12086 if (GET_CODE (op0) == VEC_DUPLICATE)
12087 op0 = XEXP (op0, 0);
12088 else if (GET_CODE (op1) == VEC_DUPLICATE)
12089 op1 = XEXP (op1, 0);
12090
12091 if (GET_CODE (op0) == VEC_SELECT)
12092 op0 = XEXP (op0, 0);
12093 else if (GET_CODE (op1) == VEC_SELECT)
12094 op1 = XEXP (op1, 0);
12095
12096 /* If the remaining parameters are not registers,
12097 get the cost to put them into registers. */
12098 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12099 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12100 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12101 return true;
12102
12103 case FLOAT:
12104 case UNSIGNED_FLOAT:
12105 if (speed)
12106 *cost += extra_cost->fp[mode == DFmode].fromint;
12107 return false;
12108
12109 case FLOAT_EXTEND:
12110 if (speed)
12111 {
12112 if (VECTOR_MODE_P (mode))
12113 {
12114 /* Vector widening conversion. */
12115 *cost += extra_cost->vect.alu;
12116 }
12117 else
12118 *cost += extra_cost->fp[mode == DFmode].widen;
12119 }
12120 return false;
12121
12122 case FLOAT_TRUNCATE:
12123 if (speed)
12124 {
12125 if (VECTOR_MODE_P (mode))
12126 {
12127 /* Vector narrowing conversion. */
12128 *cost += extra_cost->vect.alu;
12129 }
12130 else
12131 *cost += extra_cost->fp[mode == DFmode].narrow;
12132 }
12133 return false;
12134
12135 case FIX:
12136 case UNSIGNED_FIX:
12137 x = XEXP (x, 0);
12138 /* Strip the rounding part. They will all be implemented
12139 by the fcvt* family of instructions anyway. */
12140 if (GET_CODE (x) == UNSPEC)
12141 {
12142 unsigned int uns_code = XINT (x, 1);
12143
12144 if (uns_code == UNSPEC_FRINTA
12145 || uns_code == UNSPEC_FRINTM
12146 || uns_code == UNSPEC_FRINTN
12147 || uns_code == UNSPEC_FRINTP
12148 || uns_code == UNSPEC_FRINTZ)
12149 x = XVECEXP (x, 0, 0);
12150 }
12151
12152 if (speed)
12153 {
12154 if (VECTOR_MODE_P (mode))
12155 *cost += extra_cost->vect.alu;
12156 else
12157 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12158 }
12159
12160 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12161 fixed-point fcvt. */
12162 if (GET_CODE (x) == MULT
12163 && ((VECTOR_MODE_P (mode)
12164 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12165 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12166 {
12167 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12168 0, speed);
12169 return true;
12170 }
12171
12172 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12173 return true;
12174
12175 case ABS:
12176 if (VECTOR_MODE_P (mode))
12177 {
12178 /* ABS (vector). */
12179 if (speed)
12180 *cost += extra_cost->vect.alu;
12181 }
12182 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12183 {
12184 op0 = XEXP (x, 0);
12185
12186 /* FABD, which is analogous to FADD. */
12187 if (GET_CODE (op0) == MINUS)
12188 {
12189 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12190 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12191 if (speed)
12192 *cost += extra_cost->fp[mode == DFmode].addsub;
12193
12194 return true;
12195 }
12196 /* Simple FABS is analogous to FNEG. */
12197 if (speed)
12198 *cost += extra_cost->fp[mode == DFmode].neg;
12199 }
12200 else
12201 {
12202 /* Integer ABS will either be split into
12203 two arithmetic instructions, or will be an ABS
12204 (scalar), which we don't model. */
12205 *cost = COSTS_N_INSNS (2);
12206 if (speed)
12207 *cost += 2 * extra_cost->alu.arith;
12208 }
12209 return false;
12210
12211 case SMAX:
12212 case SMIN:
12213 if (speed)
12214 {
12215 if (VECTOR_MODE_P (mode))
12216 *cost += extra_cost->vect.alu;
12217 else
12218 {
12219 /* FMAXNM/FMINNM/FMAX/FMIN.
12220 TODO: This may not be accurate for all implementations, but
12221 we do not model this in the cost tables. */
12222 *cost += extra_cost->fp[mode == DFmode].addsub;
12223 }
12224 }
12225 return false;
12226
12227 case UNSPEC:
12228 /* The floating point round to integer frint* instructions. */
12229 if (aarch64_frint_unspec_p (XINT (x, 1)))
12230 {
12231 if (speed)
12232 *cost += extra_cost->fp[mode == DFmode].roundint;
12233
12234 return false;
12235 }
12236
12237 if (XINT (x, 1) == UNSPEC_RBIT)
12238 {
12239 if (speed)
12240 *cost += extra_cost->alu.rev;
12241
12242 return false;
12243 }
12244 break;
12245
12246 case TRUNCATE:
12247
12248 /* Decompose <su>muldi3_highpart. */
12249 if (/* (truncate:DI */
12250 mode == DImode
12251 /* (lshiftrt:TI */
12252 && GET_MODE (XEXP (x, 0)) == TImode
12253 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12254 /* (mult:TI */
12255 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12256 /* (ANY_EXTEND:TI (reg:DI))
12257 (ANY_EXTEND:TI (reg:DI))) */
12258 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12259 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12260 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12261 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12262 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12263 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12264 /* (const_int 64) */
12265 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12266 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12267 {
12268 /* UMULH/SMULH. */
12269 if (speed)
12270 *cost += extra_cost->mult[mode == DImode].extend;
12271 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12272 mode, MULT, 0, speed);
12273 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12274 mode, MULT, 1, speed);
12275 return true;
12276 }
12277
12278 /* Fall through. */
12279 default:
12280 break;
12281 }
12282
12283 if (dump_file
12284 && flag_aarch64_verbose_cost)
12285 fprintf (dump_file,
12286 "\nFailed to cost RTX. Assuming default cost.\n");
12287
12288 return true;
12289 }
12290
12291 /* Wrapper around aarch64_rtx_costs that dumps the partial or total
12292 cost calculated for X. The cost is stored in *COST. Returns true
12293 if the total cost of X was calculated. */
12294 static bool
12295 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12296 int param, int *cost, bool speed)
12297 {
12298 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12299
12300 if (dump_file
12301 && flag_aarch64_verbose_cost)
12302 {
12303 print_rtl_single (dump_file, x);
12304 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12305 speed ? "Hot" : "Cold",
12306 *cost, result ? "final" : "partial");
12307 }
12308
12309 return result;
12310 }
12311
12312 static int
12313 aarch64_register_move_cost (machine_mode mode,
12314 reg_class_t from_i, reg_class_t to_i)
12315 {
12316 enum reg_class from = (enum reg_class) from_i;
12317 enum reg_class to = (enum reg_class) to_i;
12318 const struct cpu_regmove_cost *regmove_cost
12319 = aarch64_tune_params.regmove_cost;
12320
12321 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12322 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12323 to = GENERAL_REGS;
12324
12325 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12326 from = GENERAL_REGS;
12327
12328 /* Make RDFFR very expensive. In particular, if we know that the FFR
12329 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12330 as a way of obtaining a PTRUE. */
12331 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12332 && hard_reg_set_subset_p (reg_class_contents[from_i],
12333 reg_class_contents[FFR_REGS]))
12334 return 80;
12335
12336 /* Moving between a GPR and the stack register costs the same as GP2GP. */
12337 if ((from == GENERAL_REGS && to == STACK_REG)
12338 || (to == GENERAL_REGS && from == STACK_REG))
12339 return regmove_cost->GP2GP;
12340
12341 /* To/From the stack register, we move via the gprs. */
12342 if (to == STACK_REG || from == STACK_REG)
12343 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12344 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12345
12346 if (known_eq (GET_MODE_SIZE (mode), 16))
12347 {
12348 /* 128-bit operations on general registers require 2 instructions. */
12349 if (from == GENERAL_REGS && to == GENERAL_REGS)
12350 return regmove_cost->GP2GP * 2;
12351 else if (from == GENERAL_REGS)
12352 return regmove_cost->GP2FP * 2;
12353 else if (to == GENERAL_REGS)
12354 return regmove_cost->FP2GP * 2;
12355
12356 /* When AdvSIMD instructions are disabled it is not possible to move
12357 a 128-bit value directly between Q registers. This is handled in
12358 secondary reload. A general register is used as a scratch to move
12359 the upper DI value and the lower DI value is moved directly,
12360 hence the cost is the sum of three moves. */
12361 if (! TARGET_SIMD)
12362 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12363
12364 return regmove_cost->FP2FP;
12365 }
12366
12367 if (from == GENERAL_REGS && to == GENERAL_REGS)
12368 return regmove_cost->GP2GP;
12369 else if (from == GENERAL_REGS)
12370 return regmove_cost->GP2FP;
12371 else if (to == GENERAL_REGS)
12372 return regmove_cost->FP2GP;
12373
12374 return regmove_cost->FP2FP;
12375 }
12376
12377 static int
12378 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12379 reg_class_t rclass ATTRIBUTE_UNUSED,
12380 bool in ATTRIBUTE_UNUSED)
12381 {
12382 return aarch64_tune_params.memmov_cost;
12383 }
12384
12385 /* Implement TARGET_INIT_BUILTINS. */
12386 static void
12387 aarch64_init_builtins ()
12388 {
12389 aarch64_general_init_builtins ();
12390 aarch64_sve::init_builtins ();
12391 }
12392
12393 /* Implement TARGET_FOLD_BUILTIN. */
12394 static tree
12395 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12396 {
12397 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12398 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12399 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12400 switch (code & AARCH64_BUILTIN_CLASS)
12401 {
12402 case AARCH64_BUILTIN_GENERAL:
12403 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12404
12405 case AARCH64_BUILTIN_SVE:
12406 return NULL_TREE;
12407 }
12408 gcc_unreachable ();
12409 }
12410
12411 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12412 static bool
12413 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12414 {
12415 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12416 tree fndecl = gimple_call_fndecl (stmt);
12417 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12418 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12419 gimple *new_stmt = NULL;
12420 switch (code & AARCH64_BUILTIN_CLASS)
12421 {
12422 case AARCH64_BUILTIN_GENERAL:
12423 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12424 break;
12425
12426 case AARCH64_BUILTIN_SVE:
12427 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12428 break;
12429 }
12430
12431 if (!new_stmt)
12432 return false;
12433
12434 gsi_replace (gsi, new_stmt, true);
12435 return true;
12436 }
12437
12438 /* Implement TARGET_EXPAND_BUILTIN. */
12439 static rtx
12440 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12441 {
12442 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12443 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12444 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12445 switch (code & AARCH64_BUILTIN_CLASS)
12446 {
12447 case AARCH64_BUILTIN_GENERAL:
12448 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12449
12450 case AARCH64_BUILTIN_SVE:
12451 return aarch64_sve::expand_builtin (subcode, exp, target);
12452 }
12453 gcc_unreachable ();
12454 }
12455
12456 /* Implement TARGET_BUILTIN_DECL. */
12457 static tree
12458 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12459 {
12460 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12461 switch (code & AARCH64_BUILTIN_CLASS)
12462 {
12463 case AARCH64_BUILTIN_GENERAL:
12464 return aarch64_general_builtin_decl (subcode, initialize_p);
12465
12466 case AARCH64_BUILTIN_SVE:
12467 return aarch64_sve::builtin_decl (subcode, initialize_p);
12468 }
12469 gcc_unreachable ();
12470 }
12471
12472 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12473 to optimize 1.0/sqrt. */
12474
12475 static bool
12476 use_rsqrt_p (machine_mode mode)
12477 {
12478 return (!flag_trapping_math
12479 && flag_unsafe_math_optimizations
12480 && ((aarch64_tune_params.approx_modes->recip_sqrt
12481 & AARCH64_APPROX_MODE (mode))
12482 || flag_mrecip_low_precision_sqrt));
12483 }
12484
12485 /* Function to decide when to use the approximate reciprocal square root
12486 builtin. */
12487
12488 static tree
12489 aarch64_builtin_reciprocal (tree fndecl)
12490 {
12491 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12492
12493 if (!use_rsqrt_p (mode))
12494 return NULL_TREE;
12495 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12496 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12497 switch (code & AARCH64_BUILTIN_CLASS)
12498 {
12499 case AARCH64_BUILTIN_GENERAL:
12500 return aarch64_general_builtin_rsqrt (subcode);
12501
12502 case AARCH64_BUILTIN_SVE:
12503 return NULL_TREE;
12504 }
12505 gcc_unreachable ();
12506 }
12507
12508 /* Emit instruction sequence to compute either the approximate square root
12509 or its approximate reciprocal, depending on the flag RECP, and return
12510 whether the sequence was emitted or not. */
12511
12512 bool
12513 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12514 {
12515 machine_mode mode = GET_MODE (dst);
12516
12517 if (GET_MODE_INNER (mode) == HFmode)
12518 {
12519 gcc_assert (!recp);
12520 return false;
12521 }
12522
12523 if (!recp)
12524 {
12525 if (!(flag_mlow_precision_sqrt
12526 || (aarch64_tune_params.approx_modes->sqrt
12527 & AARCH64_APPROX_MODE (mode))))
12528 return false;
12529
12530 if (flag_finite_math_only
12531 || flag_trapping_math
12532 || !flag_unsafe_math_optimizations
12533 || optimize_function_for_size_p (cfun))
12534 return false;
12535 }
12536 else
12537 /* Caller assumes we cannot fail. */
12538 gcc_assert (use_rsqrt_p (mode));
12539
12540 machine_mode mmsk = (VECTOR_MODE_P (mode)
12541 ? mode_for_int_vector (mode).require ()
12542 : int_mode_for_mode (mode).require ());
12543 rtx xmsk = gen_reg_rtx (mmsk);
12544 if (!recp)
12545 /* When calculating the approximate square root, compare the
12546 argument with 0.0 and create a mask. */
12547 emit_insn (gen_rtx_SET (xmsk,
12548 gen_rtx_NEG (mmsk,
12549 gen_rtx_EQ (mmsk, src,
12550 CONST0_RTX (mode)))));
12551
12552 /* Estimate the approximate reciprocal square root. */
12553 rtx xdst = gen_reg_rtx (mode);
12554 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12555
12556 /* Iterate over the series twice for SF and thrice for DF. */
12557 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12558
12559 /* Optionally run one fewer iteration of the series, trading some
12560 accuracy for speed. */
12561 if ((recp && flag_mrecip_low_precision_sqrt)
12562 || (!recp && flag_mlow_precision_sqrt))
12563 iterations--;
12564
12565 /* Iterate over the series to calculate the approximate reciprocal square
12566 root. */
12567 rtx x1 = gen_reg_rtx (mode);
12568 while (iterations--)
12569 {
12570 rtx x2 = gen_reg_rtx (mode);
12571 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
12572
12573 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12574
12575 if (iterations > 0)
12576 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
12577 }
12578
12579 if (!recp)
12580 {
12581 /* Qualify the approximate reciprocal square root when the argument is
12582 0.0 by squashing the intermediary result to 0.0. */
12583 rtx xtmp = gen_reg_rtx (mmsk);
12584 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12585 gen_rtx_SUBREG (mmsk, xdst, 0)));
12586 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12587
12588 /* Calculate the approximate square root. */
12589 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
12590 }
12591
12592 /* Finalize the approximation. */
12593 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
12594
12595 return true;
12596 }
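
/* Illustrative sketch of the recurrence implemented above (assuming
   FRSQRTS computes (3 - a * b) / 2 as documented in the Arm ARM):
   starting from the FRSQRTE estimate x0 ~= 1/sqrt(d), each pass computes

     x2 = x_n * x_n
     x1 = (3 - d * x2) / 2	-- gen_aarch64_rsqrts
     x_{n+1} = x_n * x1

   and, in the non-reciprocal case, sqrt(d) is recovered at the end as
   d * (1/sqrt(d)), with the earlier compare-against-0.0 mask forcing a
   0.0 result for a zero input.  */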
12597
12598 /* Emit the instruction sequence to compute the approximation for the division
12599 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12600
12601 bool
12602 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12603 {
12604 machine_mode mode = GET_MODE (quo);
12605
12606 if (GET_MODE_INNER (mode) == HFmode)
12607 return false;
12608
12609 bool use_approx_division_p = (flag_mlow_precision_div
12610 || (aarch64_tune_params.approx_modes->division
12611 & AARCH64_APPROX_MODE (mode)));
12612
12613 if (!flag_finite_math_only
12614 || flag_trapping_math
12615 || !flag_unsafe_math_optimizations
12616 || optimize_function_for_size_p (cfun)
12617 || !use_approx_division_p)
12618 return false;
12619
12620 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12621 return false;
12622
12623 /* Estimate the approximate reciprocal. */
12624 rtx xrcp = gen_reg_rtx (mode);
12625 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12626
12627 /* Iterate over the series twice for SF and thrice for DF. */
12628 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12629
12630 /* Optionally run one fewer iteration of the series, trading some
12631 accuracy for speed. */
12632 if (flag_mlow_precision_div)
12633 iterations--;
12634
12635 /* Iterate over the series to calculate the approximate reciprocal. */
12636 rtx xtmp = gen_reg_rtx (mode);
12637 while (iterations--)
12638 {
12639 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12640
12641 if (iterations > 0)
12642 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
12643 }
12644
12645 if (num != CONST1_RTX (mode))
12646 {
12647 /* As the approximate reciprocal of DEN is already calculated, only
12648 calculate the approximate division when NUM is not 1.0. */
12649 rtx xnum = force_reg (mode, num);
12650 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
12651 }
12652
12653 /* Finalize the approximation. */
12654 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
12655 return true;
12656 }
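
/* Illustrative sketch of the recurrence implemented above (assuming
   FRECPS computes (2 - a * b) as documented in the Arm ARM): starting
   from the FRECPE estimate x0 ~= 1/den, each pass computes

     t = 2 - den * x_n		-- gen_aarch64_frecps
     x_{n+1} = x_n * t

   and the quotient NUM / DEN is then formed as num * x_n * t, with the
   last correction factor folded into the final multiply into QUO.  */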
12657
12658 /* Return the number of instructions that can be issued per cycle. */
12659 static int
12660 aarch64_sched_issue_rate (void)
12661 {
12662 return aarch64_tune_params.issue_rate;
12663 }
12664
12665 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12666 static int
12667 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12668 {
12669 if (DEBUG_INSN_P (insn))
12670 return more;
12671
12672 rtx_code code = GET_CODE (PATTERN (insn));
12673 if (code == USE || code == CLOBBER)
12674 return more;
12675
12676 if (get_attr_type (insn) == TYPE_NO_INSN)
12677 return more;
12678
12679 return more - 1;
12680 }
12681
12682 static int
12683 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12684 {
12685 int issue_rate = aarch64_sched_issue_rate ();
12686
12687 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12688 }
12689
12690
12691 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12692 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12693 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12694
12695 static int
12696 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
12697 int ready_index)
12698 {
12699 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
12700 }
12701
12702
12703 /* Vectorizer cost model target hooks. */
12704
12705 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12706 static int
12707 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
12708 tree vectype,
12709 int misalign ATTRIBUTE_UNUSED)
12710 {
12711 unsigned elements;
12712 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
12713 bool fp = false;
12714
12715 if (vectype != NULL)
12716 fp = FLOAT_TYPE_P (vectype);
12717
12718 switch (type_of_cost)
12719 {
12720 case scalar_stmt:
12721 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
12722
12723 case scalar_load:
12724 return costs->scalar_load_cost;
12725
12726 case scalar_store:
12727 return costs->scalar_store_cost;
12728
12729 case vector_stmt:
12730 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12731
12732 case vector_load:
12733 return costs->vec_align_load_cost;
12734
12735 case vector_store:
12736 return costs->vec_store_cost;
12737
12738 case vec_to_scalar:
12739 return costs->vec_to_scalar_cost;
12740
12741 case scalar_to_vec:
12742 return costs->scalar_to_vec_cost;
12743
12744 case unaligned_load:
12745 case vector_gather_load:
12746 return costs->vec_unalign_load_cost;
12747
12748 case unaligned_store:
12749 case vector_scatter_store:
12750 return costs->vec_unalign_store_cost;
12751
12752 case cond_branch_taken:
12753 return costs->cond_taken_branch_cost;
12754
12755 case cond_branch_not_taken:
12756 return costs->cond_not_taken_branch_cost;
12757
12758 case vec_perm:
12759 return costs->vec_permute_cost;
12760
12761 case vec_promote_demote:
12762 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12763
12764 case vec_construct:
12765 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12766 return elements / 2 + 1;
12767
12768 default:
12769 gcc_unreachable ();
12770 }
12771 }
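
/* For example, a vec_construct of a 4-element vector is costed above as
   4 / 2 + 1 = 3, independently of the per-core cost table, whereas the
   other entries come straight from aarch64_tune_params.vec_costs.  */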
12772
12773 /* Implement targetm.vectorize.add_stmt_cost. */
12774 static unsigned
12775 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
12776 struct _stmt_vec_info *stmt_info, int misalign,
12777 enum vect_cost_model_location where)
12778 {
12779 unsigned *cost = (unsigned *) data;
12780 unsigned retval = 0;
12781
12782 if (flag_vect_cost_model)
12783 {
12784 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
12785 int stmt_cost =
12786 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
12787
12788 /* Statements in an inner loop relative to the loop being
12789 vectorized are weighted more heavily. The value here is
12790 arbitrary and could potentially be improved with analysis. */
12791 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
12792 count *= 50; /* FIXME */
12793
12794 retval = (unsigned) (count * stmt_cost);
12795 cost[where] += retval;
12796 }
12797
12798 return retval;
12799 }
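
/* For example (illustrative only), a single vector_load statement that
   sits in an inner loop relative to the loop being vectorized is
   accumulated into the vect_body bucket as 50 * vec_align_load_cost,
   reflecting the arbitrary weighting noted above.  */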
12800
12801 static void initialize_aarch64_code_model (struct gcc_options *);
12802
12803 /* Parse the TO_PARSE string and put the architecture struct that it
12804 selects into RES and the architectural features into ISA_FLAGS.
12805 Return an aarch64_parse_opt_result describing the parse result.
12806 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12807 When the TO_PARSE string contains an invalid extension,
12808 a copy of the string is created and stored to INVALID_EXTENSION. */
12809
12810 static enum aarch64_parse_opt_result
12811 aarch64_parse_arch (const char *to_parse, const struct processor **res,
12812 uint64_t *isa_flags, std::string *invalid_extension)
12813 {
12814 const char *ext;
12815 const struct processor *arch;
12816 size_t len;
12817
12818 ext = strchr (to_parse, '+');
12819
12820 if (ext != NULL)
12821 len = ext - to_parse;
12822 else
12823 len = strlen (to_parse);
12824
12825 if (len == 0)
12826 return AARCH64_PARSE_MISSING_ARG;
12827
12828
12829 /* Loop through the list of supported ARCHes to find a match. */
12830 for (arch = all_architectures; arch->name != NULL; arch++)
12831 {
12832 if (strlen (arch->name) == len
12833 && strncmp (arch->name, to_parse, len) == 0)
12834 {
12835 uint64_t isa_temp = arch->flags;
12836
12837 if (ext != NULL)
12838 {
12839 /* TO_PARSE string contains at least one extension. */
12840 enum aarch64_parse_opt_result ext_res
12841 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12842
12843 if (ext_res != AARCH64_PARSE_OK)
12844 return ext_res;
12845 }
12846 /* Extension parsing was successful. Confirm the result
12847 arch and ISA flags. */
12848 *res = arch;
12849 *isa_flags = isa_temp;
12850 return AARCH64_PARSE_OK;
12851 }
12852 }
12853
12854 /* ARCH name not found in list. */
12855 return AARCH64_PARSE_INVALID_ARG;
12856 }
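
/* For example (illustrative only), the string "armv8.2-a+sve" is split at
   the first '+': "armv8.2-a" is looked up in all_architectures and the
   remainder "+sve" is handed to aarch64_parse_extension, which folds the
   extension's feature bits into the returned ISA flags.
   aarch64_parse_cpu below follows the same scheme for CPU names.  */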
12857
12858 /* Parse the TO_PARSE string and put the result tuning in RES and the
12859 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12860 describing the parse result. If there is an error parsing, RES and
12861 ISA_FLAGS are left unchanged.
12862 When the TO_PARSE string contains an invalid extension,
12863 a copy of the string is created and stored to INVALID_EXTENSION. */
12864
12865 static enum aarch64_parse_opt_result
12866 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12867 uint64_t *isa_flags, std::string *invalid_extension)
12868 {
12869 const char *ext;
12870 const struct processor *cpu;
12871 size_t len;
12872
12873 ext = strchr (to_parse, '+');
12874
12875 if (ext != NULL)
12876 len = ext - to_parse;
12877 else
12878 len = strlen (to_parse);
12879
12880 if (len == 0)
12881 return AARCH64_PARSE_MISSING_ARG;
12882
12883
12884 /* Loop through the list of supported CPUs to find a match. */
12885 for (cpu = all_cores; cpu->name != NULL; cpu++)
12886 {
12887 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12888 {
12889 uint64_t isa_temp = cpu->flags;
12890
12891
12892 if (ext != NULL)
12893 {
12894 /* TO_PARSE string contains at least one extension. */
12895 enum aarch64_parse_opt_result ext_res
12896 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12897
12898 if (ext_res != AARCH64_PARSE_OK)
12899 return ext_res;
12900 }
12901 /* Extension parsing was successful. Confirm the result
12902 cpu and ISA flags. */
12903 *res = cpu;
12904 *isa_flags = isa_temp;
12905 return AARCH64_PARSE_OK;
12906 }
12907 }
12908
12909 /* CPU name not found in list. */
12910 return AARCH64_PARSE_INVALID_ARG;
12911 }
12912
12913 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12914 Return an aarch64_parse_opt_result describing the parse result.
12915 If the parsing fails, RES does not change. */
12916
12917 static enum aarch64_parse_opt_result
12918 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12919 {
12920 const struct processor *cpu;
12921
12922 /* Loop through the list of supported CPUs to find a match. */
12923 for (cpu = all_cores; cpu->name != NULL; cpu++)
12924 {
12925 if (strcmp (cpu->name, to_parse) == 0)
12926 {
12927 *res = cpu;
12928 return AARCH64_PARSE_OK;
12929 }
12930 }
12931
12932 /* CPU name not found in list. */
12933 return AARCH64_PARSE_INVALID_ARG;
12934 }
12935
12936 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12937 described in FLAG. If it is, return the index bit for that fusion type.
12938 If not, error (printing OPTION_NAME) and return zero. */
12939
12940 static unsigned int
12941 aarch64_parse_one_option_token (const char *token,
12942 size_t length,
12943 const struct aarch64_flag_desc *flag,
12944 const char *option_name)
12945 {
12946 for (; flag->name != NULL; flag++)
12947 {
12948 if (length == strlen (flag->name)
12949 && !strncmp (flag->name, token, length))
12950 return flag->flag;
12951 }
12952
12953 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12954 return 0;
12955 }
12956
12957 /* Parse OPTION, which is a '.'-separated list of flags to enable.
12958 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12959 default state we inherit from the CPU tuning structures. OPTION_NAME
12960 gives the top-level option we are parsing in the -moverride string,
12961 for use in error messages. */
12962
12963 static unsigned int
12964 aarch64_parse_boolean_options (const char *option,
12965 const struct aarch64_flag_desc *flags,
12966 unsigned int initial_state,
12967 const char *option_name)
12968 {
12969 const char separator = '.';
12970 const char* specs = option;
12971 const char* ntoken = option;
12972 unsigned int found_flags = initial_state;
12973
12974 while ((ntoken = strchr (specs, separator)))
12975 {
12976 size_t token_length = ntoken - specs;
12977 unsigned token_ops = aarch64_parse_one_option_token (specs,
12978 token_length,
12979 flags,
12980 option_name);
12981 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12982 in the token stream, reset the supported operations. So:
12983
12984 adrp+add.cmp+branch.none.adrp+add
12985
12986 would have the result of turning on only adrp+add fusion. */
12987 if (!token_ops)
12988 found_flags = 0;
12989
12990 found_flags |= token_ops;
12991 specs = ++ntoken;
12992 }
12993
12994 /* The string ended with a trailing separator; diagnose it as ill-formed. */
12995 if (!(*specs))
12996 {
12997 error ("%s string ill-formed\n", option_name);
12998 return 0;
12999 }
13000
13001 /* We still have one more token to parse. */
13002 size_t token_length = strlen (specs);
13003 unsigned token_ops = aarch64_parse_one_option_token (specs,
13004 token_length,
13005 flags,
13006 option_name);
13007 if (!token_ops)
13008 found_flags = 0;
13009
13010 found_flags |= token_ops;
13011 return found_flags;
13012 }
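
/* For example (illustrative only), -moverride=fuse=adrp+add.cmp+branch
   reaches this parser as the string "adrp+add.cmp+branch" and enables
   those two fusion pairs on top of whatever INITIAL_STATE the CPU's
   tuning structure supplied.  */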
13013
13014 /* Support for overriding instruction fusion. */
13015
13016 static void
13017 aarch64_parse_fuse_string (const char *fuse_string,
13018 struct tune_params *tune)
13019 {
13020 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13021 aarch64_fusible_pairs,
13022 tune->fusible_ops,
13023 "fuse=");
13024 }
13025
13026 /* Support for overriding other tuning flags. */
13027
13028 static void
13029 aarch64_parse_tune_string (const char *tune_string,
13030 struct tune_params *tune)
13031 {
13032 tune->extra_tuning_flags
13033 = aarch64_parse_boolean_options (tune_string,
13034 aarch64_tuning_flags,
13035 tune->extra_tuning_flags,
13036 "tune=");
13037 }
13038
13039 /* Parse the sve_width -moverride substring in TUNE_STRING.
13040 Accept the valid SVE vector widths allowed by
13041 aarch64_sve_vector_bits_enum and use it to override sve_width
13042 in TUNE. */
13043
13044 static void
13045 aarch64_parse_sve_width_string (const char *tune_string,
13046 struct tune_params *tune)
13047 {
13048 int width = -1;
13049
13050 int n = sscanf (tune_string, "%d", &width);
13051 if (n == EOF)
13052 {
13053 error ("invalid format for sve_width");
13054 return;
13055 }
13056 switch (width)
13057 {
13058 case SVE_128:
13059 case SVE_256:
13060 case SVE_512:
13061 case SVE_1024:
13062 case SVE_2048:
13063 break;
13064 default:
13065 error ("invalid sve_width value: %d", width);
13066 }
13067 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13068 }
13069
13070 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13071 we understand. If it is, extract the option string and hand it off to
13072 the appropriate function. */
13073
13074 void
13075 aarch64_parse_one_override_token (const char* token,
13076 size_t length,
13077 struct tune_params *tune)
13078 {
13079 const struct aarch64_tuning_override_function *fn
13080 = aarch64_tuning_override_functions;
13081
13082 const char *option_part = strchr (token, '=');
13083 if (!option_part)
13084 {
13085 error ("tuning string missing in option (%s)", token);
13086 return;
13087 }
13088
13089 /* Get the length of the option name. */
13090 length = option_part - token;
13091 /* Skip the '=' to get to the option string. */
13092 option_part++;
13093
13094 for (; fn->name != NULL; fn++)
13095 {
13096 if (!strncmp (fn->name, token, length))
13097 {
13098 fn->parse_override (option_part, tune);
13099 return;
13100 }
13101 }
13102
13103 error ("unknown tuning option (%s)",token);
13104 return;
13105 }
13106
13107 /* Validate and clamp the TLS size for the selected code model. */
13108
13109 static void
13110 initialize_aarch64_tls_size (struct gcc_options *opts)
13111 {
13112 if (aarch64_tls_size == 0)
13113 aarch64_tls_size = 24;
13114
13115 switch (opts->x_aarch64_cmodel_var)
13116 {
13117 case AARCH64_CMODEL_TINY:
13118 /* Both the default and the maximum TLS size allowed under tiny are 1M,
13119 which needs two instructions to address, so we clamp the size to 24. */
13120 if (aarch64_tls_size > 24)
13121 aarch64_tls_size = 24;
13122 break;
13123 case AARCH64_CMODEL_SMALL:
13124 /* The maximum TLS size allowed under small is 4G. */
13125 if (aarch64_tls_size > 32)
13126 aarch64_tls_size = 32;
13127 break;
13128 case AARCH64_CMODEL_LARGE:
13129 /* The maximum TLS size allowed under large is 16E.
13130 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13131 if (aarch64_tls_size > 48)
13132 aarch64_tls_size = 48;
13133 break;
13134 default:
13135 gcc_unreachable ();
13136 }
13137
13138 return;
13139 }
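
/* For example, combining -mcmodel=tiny with -mtls-size=32 is quietly
   clamped here to a 24-bit TLS size, matching the 1M limit described in
   the tiny case above.  */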
13140
13141 /* Parse STRING looking for options in the format:
13142 string :: option:string
13143 option :: name=substring
13144 name :: {a-z}
13145 substring :: defined by option. */
13146
13147 static void
13148 aarch64_parse_override_string (const char* input_string,
13149 struct tune_params* tune)
13150 {
13151 const char separator = ':';
13152 size_t string_length = strlen (input_string) + 1;
13153 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13154 char *string = string_root;
13155 strncpy (string, input_string, string_length);
13156 string[string_length - 1] = '\0';
13157
13158 char* ntoken = string;
13159
13160 while ((ntoken = strchr (string, separator)))
13161 {
13162 size_t token_length = ntoken - string;
13163 /* Make this substring look like a string. */
13164 *ntoken = '\0';
13165 aarch64_parse_one_override_token (string, token_length, tune);
13166 string = ++ntoken;
13167 }
13168
13169 /* One last option to parse. */
13170 aarch64_parse_one_override_token (string, strlen (string), tune);
13171 free (string_root);
13172 }
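
/* For example (illustrative only), -moverride=sve_width=256:fuse=adrp+add
   is split at the ':' separators here, and each name=substring token is
   then dispatched by aarch64_parse_one_override_token to the matching
   handler (aarch64_parse_sve_width_string and aarch64_parse_fuse_string
   in this case).  */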
13173
13174
13175 static void
13176 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13177 {
13178 if (accepted_branch_protection_string)
13179 {
13180 opts->x_aarch64_branch_protection_string
13181 = xstrdup (accepted_branch_protection_string);
13182 }
13183
13184 /* PR 70044: We have to be careful about being called multiple times for the
13185 same function. This means all changes should be repeatable. */
13186
13187 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13188 Disable the frame pointer flag so the mid-end will not use a frame
13189 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13190 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13191 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13192 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13193 if (opts->x_flag_omit_frame_pointer == 0)
13194 opts->x_flag_omit_frame_pointer = 2;
13195
13196 /* If not optimizing for size, set the default
13197 alignment to what the target wants. */
13198 if (!opts->x_optimize_size)
13199 {
13200 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13201 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13202 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13203 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13204 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13205 opts->x_str_align_functions = aarch64_tune_params.function_align;
13206 }
13207
13208 /* We default to no pc-relative literal loads. */
13209
13210 aarch64_pcrelative_literal_loads = false;
13211
13212 /* If -mpc-relative-literal-loads is set on the command line, this
13213 implies that the user asked for PC relative literal loads. */
13214 if (opts->x_pcrelative_literal_loads == 1)
13215 aarch64_pcrelative_literal_loads = true;
13216
13217 /* In the tiny memory model it makes no sense to disallow PC relative
13218 literal pool loads. */
13219 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13220 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13221 aarch64_pcrelative_literal_loads = true;
13222
13223 /* When enabling the lower precision Newton series for the square root, also
13224 enable it for the reciprocal square root, since the latter is an
13225 intermediary step for the former. */
13226 if (flag_mlow_precision_sqrt)
13227 flag_mrecip_low_precision_sqrt = true;
13228 }
13229
13230 /* 'Unpack' the internal tuning structs and update the options
13231 in OPTS. The caller must have set up selected_tune and selected_arch
13232 as all the other target-specific codegen decisions are
13233 derived from them. */
13234
13235 void
13236 aarch64_override_options_internal (struct gcc_options *opts)
13237 {
13238 aarch64_tune_flags = selected_tune->flags;
13239 aarch64_tune = selected_tune->sched_core;
13240 /* Make a copy of the tuning parameters attached to the core, which
13241 we may later overwrite. */
13242 aarch64_tune_params = *(selected_tune->tune);
13243 aarch64_architecture_version = selected_arch->architecture_version;
13244
13245 if (opts->x_aarch64_override_tune_string)
13246 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13247 &aarch64_tune_params);
13248
13249 /* This target defaults to strict volatile bitfields. */
13250 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13251 opts->x_flag_strict_volatile_bitfields = 1;
13252
13253 if (aarch64_stack_protector_guard == SSP_GLOBAL
13254 && opts->x_aarch64_stack_protector_guard_offset_str)
13255 {
13256 error ("incompatible options %<-mstack-protector-guard=global%> and "
13257 "%<-mstack-protector-guard-offset=%s%>",
13258 aarch64_stack_protector_guard_offset_str);
13259 }
13260
13261 if (aarch64_stack_protector_guard == SSP_SYSREG
13262 && !(opts->x_aarch64_stack_protector_guard_offset_str
13263 && opts->x_aarch64_stack_protector_guard_reg_str))
13264 {
13265 error ("both %<-mstack-protector-guard-offset%> and "
13266 "%<-mstack-protector-guard-reg%> must be used "
13267 "with %<-mstack-protector-guard=sysreg%>");
13268 }
13269
13270 if (opts->x_aarch64_stack_protector_guard_reg_str)
13271 {
13272 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13273 error ("specify a system register with a small string length.");
13274 }
13275
13276 if (opts->x_aarch64_stack_protector_guard_offset_str)
13277 {
13278 char *end;
13279 const char *str = aarch64_stack_protector_guard_offset_str;
13280 errno = 0;
13281 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13282 if (!*str || *end || errno)
13283 error ("%qs is not a valid offset in %qs", str,
13284 "-mstack-protector-guard-offset=");
13285 aarch64_stack_protector_guard_offset = offs;
13286 }
13287
13288 initialize_aarch64_code_model (opts);
13289 initialize_aarch64_tls_size (opts);
13290
13291 int queue_depth = 0;
13292 switch (aarch64_tune_params.autoprefetcher_model)
13293 {
13294 case tune_params::AUTOPREFETCHER_OFF:
13295 queue_depth = -1;
13296 break;
13297 case tune_params::AUTOPREFETCHER_WEAK:
13298 queue_depth = 0;
13299 break;
13300 case tune_params::AUTOPREFETCHER_STRONG:
13301 queue_depth = max_insn_queue_index + 1;
13302 break;
13303 default:
13304 gcc_unreachable ();
13305 }
13306
13307 /* We don't mind passing in global_options_set here as we don't use
13308 the *options_set structs anyway. */
13309 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
13310 queue_depth,
13311 opts->x_param_values,
13312 global_options_set.x_param_values);
13313
13314 /* Set up parameters to be used in prefetching algorithm. Do not
13315 override the defaults unless we are tuning for a core we have
13316 researched values for. */
13317 if (aarch64_tune_params.prefetch->num_slots > 0)
13318 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
13319 aarch64_tune_params.prefetch->num_slots,
13320 opts->x_param_values,
13321 global_options_set.x_param_values);
13322 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13323 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
13324 aarch64_tune_params.prefetch->l1_cache_size,
13325 opts->x_param_values,
13326 global_options_set.x_param_values);
13327 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13328 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
13329 aarch64_tune_params.prefetch->l1_cache_line_size,
13330 opts->x_param_values,
13331 global_options_set.x_param_values);
13332 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13333 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
13334 aarch64_tune_params.prefetch->l2_cache_size,
13335 opts->x_param_values,
13336 global_options_set.x_param_values);
13337 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13338 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
13339 0,
13340 opts->x_param_values,
13341 global_options_set.x_param_values);
13342 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13343 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
13344 aarch64_tune_params.prefetch->minimum_stride,
13345 opts->x_param_values,
13346 global_options_set.x_param_values);
13347
13348 /* Use the alternative scheduling-pressure algorithm by default. */
13349 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
13350 opts->x_param_values,
13351 global_options_set.x_param_values);
13352
13353 /* If the user hasn't changed it via configure then set the default to 64 KB
13354 for the backend. */
13355 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
13356 DEFAULT_STK_CLASH_GUARD_SIZE == 0
13357 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
13358 opts->x_param_values,
13359 global_options_set.x_param_values);
13360
13361 /* Validate the guard size. */
13362 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
13363
13364 /* Enforce that the probing interval equals the guard size so the mid-end
13365 does the right thing. */
13366 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
13367 guard_size,
13368 opts->x_param_values,
13369 global_options_set.x_param_values);
13370
13371 /* The maybe_set calls won't update the value if the user has explicitly set
13372 one. Which means we need to validate that probing interval and guard size
13373 are equal. */
13374 int probe_interval
13375 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
13376 if (guard_size != probe_interval)
13377 error ("stack clash guard size %<%d%> must be equal to probing interval "
13378 "%<%d%>", guard_size, probe_interval);
13379
13380 /* Enable software prefetching at the specified optimization level for
13381 CPUs that have prefetch tuning data. Lower the optimization-level
13382 threshold by 1 when profiling is enabled. */
13383 if (opts->x_flag_prefetch_loop_arrays < 0
13384 && !opts->x_optimize_size
13385 && aarch64_tune_params.prefetch->default_opt_level >= 0
13386 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13387 opts->x_flag_prefetch_loop_arrays = 1;
13388
13389 if (opts->x_aarch64_arch_string == NULL)
13390 opts->x_aarch64_arch_string = selected_arch->name;
13391 if (opts->x_aarch64_cpu_string == NULL)
13392 opts->x_aarch64_cpu_string = selected_cpu->name;
13393 if (opts->x_aarch64_tune_string == NULL)
13394 opts->x_aarch64_tune_string = selected_tune->name;
13395
13396 aarch64_override_options_after_change_1 (opts);
13397 }
13398
13399 /* Print a hint with a suggestion for a core or architecture name that
13400 most closely resembles what the user passed in STR. ARCH is true if
13401 the user is asking for an architecture name. ARCH is false if the user
13402 is asking for a core name. */
13403
13404 static void
13405 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13406 {
13407 auto_vec<const char *> candidates;
13408 const struct processor *entry = arch ? all_architectures : all_cores;
13409 for (; entry->name != NULL; entry++)
13410 candidates.safe_push (entry->name);
13411
13412 #ifdef HAVE_LOCAL_CPU_DETECT
13413 /* Add also "native" as possible value. */
13414 if (arch)
13415 candidates.safe_push ("native");
13416 #endif
13417
13418 char *s;
13419 const char *hint = candidates_list_and_hint (str, s, candidates);
13420 if (hint)
13421 inform (input_location, "valid arguments are: %s;"
13422 " did you mean %qs?", s, hint);
13423 else
13424 inform (input_location, "valid arguments are: %s", s);
13425
13426 XDELETEVEC (s);
13427 }
13428
13429 /* Print a hint with a suggestion for a core name that most closely resembles
13430 what the user passed in STR. */
13431
13432 inline static void
13433 aarch64_print_hint_for_core (const char *str)
13434 {
13435 aarch64_print_hint_for_core_or_arch (str, false);
13436 }
13437
13438 /* Print a hint with a suggestion for an architecture name that most closely
13439 resembles what the user passed in STR. */
13440
13441 inline static void
13442 aarch64_print_hint_for_arch (const char *str)
13443 {
13444 aarch64_print_hint_for_core_or_arch (str, true);
13445 }
13446
13447
13448 /* Print a hint with a suggestion for an extension name
13449 that most closely resembles what the user passed in STR. */
13450
13451 void
13452 aarch64_print_hint_for_extensions (const std::string &str)
13453 {
13454 auto_vec<const char *> candidates;
13455 aarch64_get_all_extension_candidates (&candidates);
13456 char *s;
13457 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13458 if (hint)
13459 inform (input_location, "valid arguments are: %s;"
13460 " did you mean %qs?", s, hint);
13461 else
13462 inform (input_location, "valid arguments are: %s;", s);
13463
13464 XDELETEVEC (s);
13465 }
13466
13467 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13468 specified in STR and throw errors if appropriate. Put the results,
13469 if they are valid, in RES and ISA_FLAGS. Return whether the option is
13470 valid. */
13471
13472 static bool
13473 aarch64_validate_mcpu (const char *str, const struct processor **res,
13474 uint64_t *isa_flags)
13475 {
13476 std::string invalid_extension;
13477 enum aarch64_parse_opt_result parse_res
13478 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13479
13480 if (parse_res == AARCH64_PARSE_OK)
13481 return true;
13482
13483 switch (parse_res)
13484 {
13485 case AARCH64_PARSE_MISSING_ARG:
13486 error ("missing cpu name in %<-mcpu=%s%>", str);
13487 break;
13488 case AARCH64_PARSE_INVALID_ARG:
13489 error ("unknown value %qs for %<-mcpu%>", str);
13490 aarch64_print_hint_for_core (str);
13491 break;
13492 case AARCH64_PARSE_INVALID_FEATURE:
13493 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13494 invalid_extension.c_str (), str);
13495 aarch64_print_hint_for_extensions (invalid_extension);
13496 break;
13497 default:
13498 gcc_unreachable ();
13499 }
13500
13501 return false;
13502 }
13503
13504 /* Parses CONST_STR for branch protection features specified in
13505 aarch64_branch_protect_types, and sets any required global variables. Returns
13506 the parsing result and assigns LAST_STR to the last processed token from
13507 CONST_STR so that it can be used for error reporting. */
13508
13509 static enum aarch64_parse_opt_result
13510 aarch64_parse_branch_protection (const char *const_str,
13511 char **last_str)
13512 {
13513 char *str_root = xstrdup (const_str);
13514 char* token_save = NULL;
13515 char *str = strtok_r (str_root, "+", &token_save);
13516 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13517 if (!str)
13518 res = AARCH64_PARSE_MISSING_ARG;
13519 else
13520 {
13521 char *next_str = strtok_r (NULL, "+", &token_save);
13522 /* Reset the branch protection features to their defaults. */
13523 aarch64_handle_no_branch_protection (NULL, NULL);
13524
13525 while (str && res == AARCH64_PARSE_OK)
13526 {
13527 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13528 bool found = false;
13529 /* Search for this type. */
13530 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13531 {
13532 if (strcmp (str, type->name) == 0)
13533 {
13534 found = true;
13535 res = type->handler (str, next_str);
13536 str = next_str;
13537 next_str = strtok_r (NULL, "+", &token_save);
13538 }
13539 else
13540 type++;
13541 }
13542 if (found && res == AARCH64_PARSE_OK)
13543 {
13544 bool found_subtype = true;
13545 /* Loop through each token until we find one that isn't a
13546 subtype. */
13547 while (found_subtype)
13548 {
13549 found_subtype = false;
13550 const aarch64_branch_protect_type *subtype = type->subtypes;
13551 /* Search for the subtype. */
13552 while (str && subtype && subtype->name && !found_subtype
13553 && res == AARCH64_PARSE_OK)
13554 {
13555 if (strcmp (str, subtype->name) == 0)
13556 {
13557 found_subtype = true;
13558 res = subtype->handler (str, next_str);
13559 str = next_str;
13560 next_str = strtok_r (NULL, "+", &token_save);
13561 }
13562 else
13563 subtype++;
13564 }
13565 }
13566 }
13567 else if (!found)
13568 res = AARCH64_PARSE_INVALID_ARG;
13569 }
13570 }
13571 /* Copy the last processed token into the argument to pass it back.
13572 Used by option and attribute validation to print the offending token. */
13573 if (last_str)
13574 {
13575 if (str) strcpy (*last_str, str);
13576 else *last_str = NULL;
13577 }
13578 if (res == AARCH64_PARSE_OK)
13579 {
13580 /* If needed, alloc the accepted string then copy in const_str.
13581 Used by aarch64_override_options_after_change_1. */
13582 if (!accepted_branch_protection_string)
13583 accepted_branch_protection_string = (char *) xmalloc (
13584 BRANCH_PROTECT_STR_MAX
13585 + 1);
13586 strncpy (accepted_branch_protection_string, const_str,
13587 BRANCH_PROTECT_STR_MAX + 1);
13588 /* Forcibly null-terminate. */
13589 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13590 }
13591 return res;
13592 }
13593
13594 static bool
13595 aarch64_validate_mbranch_protection (const char *const_str)
13596 {
13597 char *str = (char *) xmalloc (strlen (const_str) + 1);
13598 enum aarch64_parse_opt_result res =
13599 aarch64_parse_branch_protection (const_str, &str);
13600 if (res == AARCH64_PARSE_INVALID_ARG)
13601 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13602 else if (res == AARCH64_PARSE_MISSING_ARG)
13603 error ("missing argument for %<-mbranch-protection=%>");
13604 free (str);
13605 return res == AARCH64_PARSE_OK;
13606 }
13607
13608 /* Validate a command-line -march option. Parse the arch and extensions
13609 (if any) specified in STR and throw errors if appropriate. Put the
13610 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13611 option is valid. */
13612
13613 static bool
13614 aarch64_validate_march (const char *str, const struct processor **res,
13615 uint64_t *isa_flags)
13616 {
13617 std::string invalid_extension;
13618 enum aarch64_parse_opt_result parse_res
13619 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13620
13621 if (parse_res == AARCH64_PARSE_OK)
13622 return true;
13623
13624 switch (parse_res)
13625 {
13626 case AARCH64_PARSE_MISSING_ARG:
13627 error ("missing arch name in %<-march=%s%>", str);
13628 break;
13629 case AARCH64_PARSE_INVALID_ARG:
13630 error ("unknown value %qs for %<-march%>", str);
13631 aarch64_print_hint_for_arch (str);
13632 break;
13633 case AARCH64_PARSE_INVALID_FEATURE:
13634 error ("invalid feature modifier %qs in %<-march=%s%>",
13635 invalid_extension.c_str (), str);
13636 aarch64_print_hint_for_extensions (invalid_extension);
13637 break;
13638 default:
13639 gcc_unreachable ();
13640 }
13641
13642 return false;
13643 }
13644
13645 /* Validate a command-line -mtune option. Parse the cpu
13646 specified in STR and throw errors if appropriate. Put the
13647 result, if it is valid, in RES. Return whether the option is
13648 valid. */
13649
13650 static bool
13651 aarch64_validate_mtune (const char *str, const struct processor **res)
13652 {
13653 enum aarch64_parse_opt_result parse_res
13654 = aarch64_parse_tune (str, res);
13655
13656 if (parse_res == AARCH64_PARSE_OK)
13657 return true;
13658
13659 switch (parse_res)
13660 {
13661 case AARCH64_PARSE_MISSING_ARG:
13662 error ("missing cpu name in %<-mtune=%s%>", str);
13663 break;
13664 case AARCH64_PARSE_INVALID_ARG:
13665 error ("unknown value %qs for %<-mtune%>", str);
13666 aarch64_print_hint_for_core (str);
13667 break;
13668 default:
13669 gcc_unreachable ();
13670 }
13671 return false;
13672 }
13673
13674 /* Return the CPU corresponding to the enum CPU.
13675 If it doesn't specify a cpu, return the default. */
13676
13677 static const struct processor *
13678 aarch64_get_tune_cpu (enum aarch64_processor cpu)
13679 {
13680 if (cpu != aarch64_none)
13681 return &all_cores[cpu];
13682
13683 /* The & 0x3f is to extract the bottom 6 bits that encode the
13684 default cpu as selected by the --with-cpu GCC configure option
13685 in config.gcc.
13686 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13687 flags mechanism should be reworked to make it more sane. */
13688 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13689 }
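
/* For example, TARGET_CPU_DEFAULT packs the configure-time --with-cpu
   selection into its bottom 6 bits and that CPU's default ISA flags into
   the bits above them; aarch64_override_options below recovers the flags
   half with TARGET_CPU_DEFAULT >> 6.  */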
13690
13691 /* Return the architecture corresponding to the enum ARCH.
13692 If it doesn't specify a valid architecture, return the default. */
13693
13694 static const struct processor *
13695 aarch64_get_arch (enum aarch64_arch arch)
13696 {
13697 if (arch != aarch64_no_arch)
13698 return &all_architectures[arch];
13699
13700 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13701
13702 return &all_architectures[cpu->arch];
13703 }
13704
13705 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
13706
13707 static poly_uint16
13708 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
13709 {
13710 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
13711 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
13712 deciding which .md file patterns to use and when deciding whether
13713 something is a legitimate address or constant. */
13714 if (value == SVE_SCALABLE || value == SVE_128)
13715 return poly_uint16 (2, 2);
13716 else
13717 return (int) value / 64;
13718 }
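
/* For example, -msve-vector-bits=512 yields the constant VG 512 / 64 = 8,
   whereas both -msve-vector-bits=scalable and (for the reason given above)
   -msve-vector-bits=128 yield the runtime-variable poly_uint16 (2, 2),
   i.e. 2 + 2 * n granules for n additional 128-bit blocks.  */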
13719
13720 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13721 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13722 tuning structs. In particular it must set selected_tune and
13723 aarch64_isa_flags that define the available ISA features and tuning
13724 decisions. It must also set selected_arch as this will be used to
13725 output the .arch asm tags for each function. */
13726
13727 static void
13728 aarch64_override_options (void)
13729 {
13730 uint64_t cpu_isa = 0;
13731 uint64_t arch_isa = 0;
13732 aarch64_isa_flags = 0;
13733
13734 bool valid_cpu = true;
13735 bool valid_tune = true;
13736 bool valid_arch = true;
13737
13738 selected_cpu = NULL;
13739 selected_arch = NULL;
13740 selected_tune = NULL;
13741
13742 if (aarch64_branch_protection_string)
13743 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13744
13745 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13746 If either of -march or -mtune is given, they override their
13747 respective component of -mcpu. */
13748 if (aarch64_cpu_string)
13749 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13750 &cpu_isa);
13751
13752 if (aarch64_arch_string)
13753 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13754 &arch_isa);
13755
13756 if (aarch64_tune_string)
13757 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13758
13759 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13760 SUBTARGET_OVERRIDE_OPTIONS;
13761 #endif
13762
13763 /* If the user did not specify a processor, choose the default
13764 one for them. This will be the CPU set during configuration using
13765 --with-cpu, otherwise it is "generic". */
13766 if (!selected_cpu)
13767 {
13768 if (selected_arch)
13769 {
13770 selected_cpu = &all_cores[selected_arch->ident];
13771 aarch64_isa_flags = arch_isa;
13772 explicit_arch = selected_arch->arch;
13773 }
13774 else
13775 {
13776 /* Get default configure-time CPU. */
13777 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
13778 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
13779 }
13780
13781 if (selected_tune)
13782 explicit_tune_core = selected_tune->ident;
13783 }
13784 /* If both -mcpu and -march are specified check that they are architecturally
13785 compatible, warn if they're not and prefer the -march ISA flags. */
13786 else if (selected_arch)
13787 {
13788 if (selected_arch->arch != selected_cpu->arch)
13789 {
13790 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13791 all_architectures[selected_cpu->arch].name,
13792 selected_arch->name);
13793 }
13794 aarch64_isa_flags = arch_isa;
13795 explicit_arch = selected_arch->arch;
13796 explicit_tune_core = selected_tune ? selected_tune->ident
13797 : selected_cpu->ident;
13798 }
13799 else
13800 {
13801 /* -mcpu but no -march. */
13802 aarch64_isa_flags = cpu_isa;
13803 explicit_tune_core = selected_tune ? selected_tune->ident
13804 : selected_cpu->ident;
13805 gcc_assert (selected_cpu);
13806 selected_arch = &all_architectures[selected_cpu->arch];
13807 explicit_arch = selected_arch->arch;
13808 }
13809
13810 /* Set the arch as well, as we will need it when outputting
13811 the .arch directive in assembly. */
13812 if (!selected_arch)
13813 {
13814 gcc_assert (selected_cpu);
13815 selected_arch = &all_architectures[selected_cpu->arch];
13816 }
13817
13818 if (!selected_tune)
13819 selected_tune = selected_cpu;
13820
13821 if (aarch64_enable_bti == 2)
13822 {
13823 #ifdef TARGET_ENABLE_BTI
13824 aarch64_enable_bti = 1;
13825 #else
13826 aarch64_enable_bti = 0;
13827 #endif
13828 }
13829
13830 /* Return address signing is currently not supported for ILP32 targets. For
13831 LP64 targets use the configured option in the absence of a command-line
13832 option for -mbranch-protection. */
13833 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
13834 {
13835 #ifdef TARGET_ENABLE_PAC_RET
13836 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13837 #else
13838 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13839 #endif
13840 }
13841
13842 #ifndef HAVE_AS_MABI_OPTION
13843 /* The compiler may have been configured with 2.23.* binutils, which does
13844 not have support for ILP32. */
13845 if (TARGET_ILP32)
13846 error ("assembler does not support %<-mabi=ilp32%>");
13847 #endif
13848
13849 /* Convert -msve-vector-bits to a VG count. */
13850 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13851
13852 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13853 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13854
13855 /* Make sure we properly set up the explicit options. */
13856 if ((aarch64_cpu_string && valid_cpu)
13857 || (aarch64_tune_string && valid_tune))
13858 gcc_assert (explicit_tune_core != aarch64_none);
13859
13860 if ((aarch64_cpu_string && valid_cpu)
13861 || (aarch64_arch_string && valid_arch))
13862 gcc_assert (explicit_arch != aarch64_no_arch);
13863
13864 /* The pass to insert speculation tracking runs before
13865 shrink-wrapping and the latter does not know how to update the
13866 tracking status. So disable it in this case. */
13867 if (aarch64_track_speculation)
13868 flag_shrink_wrap = 0;
13869
13870 aarch64_override_options_internal (&global_options);
13871
13872 /* Save these options as the default ones in case we push and pop them later
13873 while processing functions with potential target attributes. */
13874 target_option_default_node = target_option_current_node
13875 = build_target_option_node (&global_options);
13876 }
13877
13878 /* Implement targetm.override_options_after_change. */
13879
13880 static void
13881 aarch64_override_options_after_change (void)
13882 {
13883 aarch64_override_options_after_change_1 (&global_options);
13884 }
13885
13886 static struct machine_function *
13887 aarch64_init_machine_status (void)
13888 {
13889 struct machine_function *machine;
13890 machine = ggc_cleared_alloc<machine_function> ();
13891 return machine;
13892 }
13893
13894 void
13895 aarch64_init_expanders (void)
13896 {
13897 init_machine_status = aarch64_init_machine_status;
13898 }
13899
13900 /* Resolve the code model to use, taking PIC settings into account. */
13901 static void
13902 initialize_aarch64_code_model (struct gcc_options *opts)
13903 {
13904 if (opts->x_flag_pic)
13905 {
13906 switch (opts->x_aarch64_cmodel_var)
13907 {
13908 case AARCH64_CMODEL_TINY:
13909 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13910 break;
13911 case AARCH64_CMODEL_SMALL:
13912 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13913 aarch64_cmodel = (flag_pic == 2
13914 ? AARCH64_CMODEL_SMALL_PIC
13915 : AARCH64_CMODEL_SMALL_SPIC);
13916 #else
13917 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13918 #endif
13919 break;
13920 case AARCH64_CMODEL_LARGE:
13921 sorry ("code model %qs with %<-f%s%>", "large",
13922 opts->x_flag_pic > 1 ? "PIC" : "pic");
13923 break;
13924 default:
13925 gcc_unreachable ();
13926 }
13927 }
13928 else
13929 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13930 }
13931
13932 /* Implement TARGET_OPTION_SAVE. */
13933
13934 static void
13935 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13936 {
13937 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13938 ptr->x_aarch64_branch_protection_string
13939 = opts->x_aarch64_branch_protection_string;
13940 }
13941
13942 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13943 using the information saved in PTR. */
13944
13945 static void
13946 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13947 {
13948 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13949 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13950 opts->x_explicit_arch = ptr->x_explicit_arch;
13951 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13952 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13953 opts->x_aarch64_branch_protection_string
13954 = ptr->x_aarch64_branch_protection_string;
13955 if (opts->x_aarch64_branch_protection_string)
13956 {
13957 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13958 NULL);
13959 }
13960
13961 aarch64_override_options_internal (opts);
13962 }
13963
13964 /* Implement TARGET_OPTION_PRINT. */
13965
13966 static void
13967 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13968 {
13969 const struct processor *cpu
13970 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13971 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13972 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13973 std::string extension
13974 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13975
13976 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13977 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13978 arch->name, extension.c_str ());
13979 }
13980
13981 static GTY(()) tree aarch64_previous_fndecl;
13982
13983 void
13984 aarch64_reset_previous_fndecl (void)
13985 {
13986 aarch64_previous_fndecl = NULL;
13987 }
13988
13989 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13990 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13991 make sure optab availability predicates are recomputed when necessary. */
13992
13993 void
13994 aarch64_save_restore_target_globals (tree new_tree)
13995 {
13996 if (TREE_TARGET_GLOBALS (new_tree))
13997 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13998 else if (new_tree == target_option_default_node)
13999 restore_target_globals (&default_target_globals);
14000 else
14001 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14002 }
14003
14004 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14005 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14006 of the function, if such exists. This function may be called multiple
14007 times on a single function so use aarch64_previous_fndecl to avoid
14008 setting up identical state. */
14009
14010 static void
14011 aarch64_set_current_function (tree fndecl)
14012 {
14013 if (!fndecl || fndecl == aarch64_previous_fndecl)
14014 return;
14015
14016 tree old_tree = (aarch64_previous_fndecl
14017 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14018 : NULL_TREE);
14019
14020 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14021
14022 /* If current function has no attributes but the previous one did,
14023 use the default node. */
14024 if (!new_tree && old_tree)
14025 new_tree = target_option_default_node;
14026
14027 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14028 the default have been handled by aarch64_save_restore_target_globals from
14029 aarch64_pragma_target_parse. */
14030 if (old_tree == new_tree)
14031 return;
14032
14033 aarch64_previous_fndecl = fndecl;
14034
14035 /* First set the target options. */
14036 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14037
14038 aarch64_save_restore_target_globals (new_tree);
14039 }
14040
14041 /* Enum describing the various ways we can handle attributes.
14042 In many cases we can reuse the generic option handling machinery. */
14043
14044 enum aarch64_attr_opt_type
14045 {
14046 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14047 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14048 aarch64_attr_enum, /* Attribute sets an enum variable. */
14049 aarch64_attr_custom /* Attribute requires a custom handling function. */
14050 };
14051
14052 /* All the information needed to handle a target attribute.
14053 NAME is the name of the attribute.
14054 ATTR_TYPE specifies the type of behavior of the attribute as described
14055 in the definition of enum aarch64_attr_opt_type.
14056 ALLOW_NEG is true if the attribute supports a "no-" form.
14057 HANDLER is the function that takes the attribute string as an argument.
14058 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14059 OPT_NUM is the enum specifying the option that the attribute modifies.
14060 This is needed for attributes that mirror the behavior of a command-line
14061 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14062 aarch64_attr_enum. */
14063
14064 struct aarch64_attribute_info
14065 {
14066 const char *name;
14067 enum aarch64_attr_opt_type attr_type;
14068 bool allow_neg;
14069 bool (*handler) (const char *);
14070 enum opt_code opt_num;
14071 };
14072
14073 /* Handle the ARCH_STR argument to the arch= target attribute. */
14074
14075 static bool
14076 aarch64_handle_attr_arch (const char *str)
14077 {
14078 const struct processor *tmp_arch = NULL;
14079 std::string invalid_extension;
14080 enum aarch64_parse_opt_result parse_res
14081 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14082
14083 if (parse_res == AARCH64_PARSE_OK)
14084 {
14085 gcc_assert (tmp_arch);
14086 selected_arch = tmp_arch;
14087 explicit_arch = selected_arch->arch;
14088 return true;
14089 }
14090
14091 switch (parse_res)
14092 {
14093 case AARCH64_PARSE_MISSING_ARG:
14094 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14095 break;
14096 case AARCH64_PARSE_INVALID_ARG:
14097 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14098 aarch64_print_hint_for_arch (str);
14099 break;
14100 case AARCH64_PARSE_INVALID_FEATURE:
14101 error ("invalid feature modifier %s of value (\"%s\") in "
14102 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14103 aarch64_print_hint_for_extensions (invalid_extension);
14104 break;
14105 default:
14106 gcc_unreachable ();
14107 }
14108
14109 return false;
14110 }
14111
14112 /* Handle the argument CPU_STR to the cpu= target attribute. */
14113
14114 static bool
14115 aarch64_handle_attr_cpu (const char *str)
14116 {
14117 const struct processor *tmp_cpu = NULL;
14118 std::string invalid_extension;
14119 enum aarch64_parse_opt_result parse_res
14120 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14121
14122 if (parse_res == AARCH64_PARSE_OK)
14123 {
14124 gcc_assert (tmp_cpu);
14125 selected_tune = tmp_cpu;
14126 explicit_tune_core = selected_tune->ident;
14127
14128 selected_arch = &all_architectures[tmp_cpu->arch];
14129 explicit_arch = selected_arch->arch;
14130 return true;
14131 }
14132
14133 switch (parse_res)
14134 {
14135 case AARCH64_PARSE_MISSING_ARG:
14136 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14137 break;
14138 case AARCH64_PARSE_INVALID_ARG:
14139 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14140 aarch64_print_hint_for_core (str);
14141 break;
14142 case AARCH64_PARSE_INVALID_FEATURE:
14143 error ("invalid feature modifier %s of value (\"%s\") in "
14144 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14145 aarch64_print_hint_for_extensions (invalid_extension);
14146 break;
14147 default:
14148 gcc_unreachable ();
14149 }
14150
14151 return false;
14152 }
14153
14154 /* Handle the argument STR to the branch-protection= attribute. */
14155
14156 static bool
14157 aarch64_handle_attr_branch_protection (const char* str)
14158 {
14159 char *err_str = (char *) xmalloc (strlen (str) + 1);
14160 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14161 &err_str);
14162 bool success = false;
14163 switch (res)
14164 {
14165 case AARCH64_PARSE_MISSING_ARG:
14166 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14167 " attribute");
14168 break;
14169 case AARCH64_PARSE_INVALID_ARG:
14170 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14171 "=\")%> pragma or attribute", err_str);
14172 break;
14173 case AARCH64_PARSE_OK:
14174 success = true;
14175 /* Fall through. */
14176 case AARCH64_PARSE_INVALID_FEATURE:
14177 break;
14178 default:
14179 gcc_unreachable ();
14180 }
14181 free (err_str);
14182 return success;
14183 }
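
/* For illustration only (the accepted strings are defined by the
   -mbranch-protection= option handling, not by this file): typical values
   that reach the handler above look like

     __attribute__ ((target ("branch-protection=standard")))
     __attribute__ ((target ("branch-protection=pac-ret+leaf")))
     __attribute__ ((target ("branch-protection=bti")))

   while an unrecognised protection type such as "branch-protection=foo"
   takes the AARCH64_PARSE_INVALID_ARG path.  */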
14184
14185 /* Handle the argument STR to the tune= target attribute. */
14186
14187 static bool
14188 aarch64_handle_attr_tune (const char *str)
14189 {
14190 const struct processor *tmp_tune = NULL;
14191 enum aarch64_parse_opt_result parse_res
14192 = aarch64_parse_tune (str, &tmp_tune);
14193
14194 if (parse_res == AARCH64_PARSE_OK)
14195 {
14196 gcc_assert (tmp_tune);
14197 selected_tune = tmp_tune;
14198 explicit_tune_core = selected_tune->ident;
14199 return true;
14200 }
14201
14202 switch (parse_res)
14203 {
14204 case AARCH64_PARSE_INVALID_ARG:
14205 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14206 aarch64_print_hint_for_core (str);
14207 break;
14208 default:
14209 gcc_unreachable ();
14210 }
14211
14212 return false;
14213 }
14214
14215 /* Parse an architecture extensions target attribute string specified in STR.
14216 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14217 if successful. Update aarch64_isa_flags to reflect the ISA features
14218 modified. */
14219
14220 static bool
14221 aarch64_handle_attr_isa_flags (char *str)
14222 {
14223 enum aarch64_parse_opt_result parse_res;
14224 uint64_t isa_flags = aarch64_isa_flags;
14225
14226 /* We allow "+nothing" in the beginning to clear out all architectural
14227 features if the user wants to handpick specific features. */
14228 if (strncmp ("+nothing", str, 8) == 0)
14229 {
14230 isa_flags = 0;
14231 str += 8;
14232 }
14233
14234 std::string invalid_extension;
14235 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14236
14237 if (parse_res == AARCH64_PARSE_OK)
14238 {
14239 aarch64_isa_flags = isa_flags;
14240 return true;
14241 }
14242
14243 switch (parse_res)
14244 {
14245 case AARCH64_PARSE_MISSING_ARG:
14246 error ("missing value in %<target()%> pragma or attribute");
14247 break;
14248
14249 case AARCH64_PARSE_INVALID_FEATURE:
14250 error ("invalid feature modifier %s of value (\"%s\") in "
14251 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14252 break;
14253
14254 default:
14255 gcc_unreachable ();
14256 }
14257
14258 return false;
14259 }
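
/* A minimal worked example of the strings handled above (illustrative
   only): "+nothing+simd" first clears every feature bit because of the
   "+nothing" prefix and then lets aarch64_parse_extension turn "+simd"
   (and whatever features it implies) back on, whereas a bare "+bogus"
   reports AARCH64_PARSE_INVALID_FEATURE with "bogus" as the invalid
   extension.  */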
14260
14261 /* The target attributes that we support. On top of these we also support just
14262 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14263 handled explicitly in aarch64_process_one_target_attr. */
14264
14265 static const struct aarch64_attribute_info aarch64_attributes[] =
14266 {
14267 { "general-regs-only", aarch64_attr_mask, false, NULL,
14268 OPT_mgeneral_regs_only },
14269 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14270 OPT_mfix_cortex_a53_835769 },
14271 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14272 OPT_mfix_cortex_a53_843419 },
14273 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14274 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14275 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14276 OPT_momit_leaf_frame_pointer },
14277 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14278 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14279 OPT_march_ },
14280 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14281 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14282 OPT_mtune_ },
14283 { "branch-protection", aarch64_attr_custom, false,
14284 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14285 { "sign-return-address", aarch64_attr_enum, false, NULL,
14286 OPT_msign_return_address_ },
14287 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14288 };
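
/* Illustrative (not exhaustive) examples of how the entries above appear
   in user code:

     __attribute__ ((target ("arch=armv8.2-a+crc")))          (custom handler)
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  (negated boolean)
     __attribute__ ((target ("strict-align")))                (target_flags mask)
     __attribute__ ((target ("tls-dialect=trad")))            (enum option)

   The bare "+ext" form, e.g. __attribute__ ((target ("+crc"))), bypasses
   this table entirely, as noted in the comment above.  */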
14289
14290 /* Parse ARG_STR which contains the definition of one target attribute.
14291 Show appropriate errors if any or return true if the attribute is valid. */
14292
14293 static bool
14294 aarch64_process_one_target_attr (char *arg_str)
14295 {
14296 bool invert = false;
14297
14298 size_t len = strlen (arg_str);
14299
14300 if (len == 0)
14301 {
14302 error ("malformed %<target()%> pragma or attribute");
14303 return false;
14304 }
14305
14306 char *str_to_check = (char *) alloca (len + 1);
14307 strcpy (str_to_check, arg_str);
14308
14309 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14310 It is easier to detect and handle it explicitly here rather than going
14311 through the machinery for the rest of the target attributes in this
14312 function. */
14313 if (*str_to_check == '+')
14314 return aarch64_handle_attr_isa_flags (str_to_check);
14315
14316 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14317 {
14318 invert = true;
14319 str_to_check += 3;
14320 }
14321 char *arg = strchr (str_to_check, '=');
14322
14323 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14324 and point ARG to "foo". */
14325 if (arg)
14326 {
14327 *arg = '\0';
14328 arg++;
14329 }
14330 const struct aarch64_attribute_info *p_attr;
14331 bool found = false;
14332 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14333 {
14334 /* If the names don't match up, or the user has given an argument
14335 to an attribute that doesn't accept one, or didn't give an argument
14336 to an attribute that expects one, fail to match. */
14337 if (strcmp (str_to_check, p_attr->name) != 0)
14338 continue;
14339
14340 found = true;
14341 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14342 || p_attr->attr_type == aarch64_attr_enum;
14343
14344 if (attr_need_arg_p ^ (arg != NULL))
14345 {
14346 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14347 return false;
14348 }
14349
14350 /* If the name matches but the attribute does not allow "no-" versions
14351 then we can't match. */
14352 if (invert && !p_attr->allow_neg)
14353 {
14354 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14355 return false;
14356 }
14357
14358 switch (p_attr->attr_type)
14359 {
14360 /* Has a custom handler registered.
14361 For example, cpu=, arch=, tune=. */
14362 case aarch64_attr_custom:
14363 gcc_assert (p_attr->handler);
14364 if (!p_attr->handler (arg))
14365 return false;
14366 break;
14367
14368 /* Either set or unset a boolean option. */
14369 case aarch64_attr_bool:
14370 {
14371 struct cl_decoded_option decoded;
14372
14373 generate_option (p_attr->opt_num, NULL, !invert,
14374 CL_TARGET, &decoded);
14375 aarch64_handle_option (&global_options, &global_options_set,
14376 &decoded, input_location);
14377 break;
14378 }
14379 /* Set or unset a bit in the target_flags. aarch64_handle_option
14380 should know what mask to apply given the option number. */
14381 case aarch64_attr_mask:
14382 {
14383 struct cl_decoded_option decoded;
14384 /* We only need to specify the option number.
14385 aarch64_handle_option will know which mask to apply. */
14386 decoded.opt_index = p_attr->opt_num;
14387 decoded.value = !invert;
14388 aarch64_handle_option (&global_options, &global_options_set,
14389 &decoded, input_location);
14390 break;
14391 }
14392 /* Use the option setting machinery to set an option to an enum. */
14393 case aarch64_attr_enum:
14394 {
14395 gcc_assert (arg);
14396 bool valid;
14397 int value;
14398 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14399 &value, CL_TARGET);
14400 if (valid)
14401 {
14402 set_option (&global_options, NULL, p_attr->opt_num, value,
14403 NULL, DK_UNSPECIFIED, input_location,
14404 global_dc);
14405 }
14406 else
14407 {
14408 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14409 }
14410 break;
14411 }
14412 default:
14413 gcc_unreachable ();
14414 }
14415 }
14416
14417 /* If we reached here we either have found an attribute and validated
14418 it or didn't match any. If we matched an attribute but its arguments
14419 were malformed we will have returned false already. */
14420 return found;
14421 }
14422
14423 /* Count how many times the character C appears in
14424 NULL-terminated string STR. */
14425
14426 static unsigned int
14427 num_occurences_in_str (char c, char *str)
14428 {
14429 unsigned int res = 0;
14430 while (*str != '\0')
14431 {
14432 if (*str == c)
14433 res++;
14434
14435 str++;
14436 }
14437
14438 return res;
14439 }
14440
14441 /* Parse the tree in ARGS that contains the target attribute information
14442 and update the global target options space. */
14443
14444 bool
14445 aarch64_process_target_attr (tree args)
14446 {
14447 if (TREE_CODE (args) == TREE_LIST)
14448 {
14449 do
14450 {
14451 tree head = TREE_VALUE (args);
14452 if (head)
14453 {
14454 if (!aarch64_process_target_attr (head))
14455 return false;
14456 }
14457 args = TREE_CHAIN (args);
14458 } while (args);
14459
14460 return true;
14461 }
14462
14463 if (TREE_CODE (args) != STRING_CST)
14464 {
14465 error ("attribute %<target%> argument not a string");
14466 return false;
14467 }
14468
14469 size_t len = strlen (TREE_STRING_POINTER (args));
14470 char *str_to_check = (char *) alloca (len + 1);
14471 strcpy (str_to_check, TREE_STRING_POINTER (args));
14472
14473 if (len == 0)
14474 {
14475 error ("malformed %<target()%> pragma or attribute");
14476 return false;
14477 }
14478
14479 /* Used to catch empty strings between commas, i.e.
14480 attribute ((target ("attr1,,attr2"))). */
14481 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14482
14483 /* Handle multiple target attributes separated by ','. */
14484 char *token = strtok_r (str_to_check, ",", &str_to_check);
14485
14486 unsigned int num_attrs = 0;
14487 while (token)
14488 {
14489 num_attrs++;
14490 if (!aarch64_process_one_target_attr (token))
14491 {
14492 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14493 return false;
14494 }
14495
14496 token = strtok_r (NULL, ",", &str_to_check);
14497 }
14498
14499 if (num_attrs != num_commas + 1)
14500 {
14501 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14502 return false;
14503 }
14504
14505 return true;
14506 }
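
/* Worked example of the splitting logic above (illustrative only):
   target ("tune=cortex-a72,no-strict-align") contains one comma and is
   split into two tokens, each handed to aarch64_process_one_target_attr,
   so num_attrs == num_commas + 1 and the string is accepted.  By contrast,
   target ("tune=cortex-a72,,no-strict-align") produces two commas but
   still only two tokens (strtok_r skips the empty field), so the final
   check rejects it as malformed.  */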
14507
14508 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14509 process attribute ((target ("..."))). */
14510
14511 static bool
14512 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14513 {
14514 struct cl_target_option cur_target;
14515 bool ret;
14516 tree old_optimize;
14517 tree new_target, new_optimize;
14518 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14519
14520 /* If what we're processing is the current pragma string then the
14521 target option node is already stored in target_option_current_node
14522 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14523 having to re-parse the string. This is especially useful to keep
14524 arm_neon.h compile times down since that header contains a lot
14525 of intrinsics enclosed in pragmas. */
14526 if (!existing_target && args == current_target_pragma)
14527 {
14528 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14529 return true;
14530 }
14531 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14532
14533 old_optimize = build_optimization_node (&global_options);
14534 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14535
14536 /* If the function changed the optimization levels as well as setting
14537 target options, start with the optimizations specified. */
14538 if (func_optimize && func_optimize != old_optimize)
14539 cl_optimization_restore (&global_options,
14540 TREE_OPTIMIZATION (func_optimize));
14541
14542 /* Save the current target options to restore at the end. */
14543 cl_target_option_save (&cur_target, &global_options);
14544
14545 /* If fndecl already has some target attributes applied to it, unpack
14546 them so that we add this attribute on top of them, rather than
14547 overwriting them. */
14548 if (existing_target)
14549 {
14550 struct cl_target_option *existing_options
14551 = TREE_TARGET_OPTION (existing_target);
14552
14553 if (existing_options)
14554 cl_target_option_restore (&global_options, existing_options);
14555 }
14556 else
14557 cl_target_option_restore (&global_options,
14558 TREE_TARGET_OPTION (target_option_current_node));
14559
14560 ret = aarch64_process_target_attr (args);
14561
14562 /* Set up any additional state. */
14563 if (ret)
14564 {
14565 aarch64_override_options_internal (&global_options);
14566 /* Initialize SIMD builtins if we haven't already.
14567 Set current_target_pragma to NULL for the duration so that
14568 the builtin initialization code doesn't try to tag the functions
14569 being built with the attributes specified by any current pragma, thus
14570 going into an infinite recursion. */
14571 if (TARGET_SIMD)
14572 {
14573 tree saved_current_target_pragma = current_target_pragma;
14574 current_target_pragma = NULL;
14575 aarch64_init_simd_builtins ();
14576 current_target_pragma = saved_current_target_pragma;
14577 }
14578 new_target = build_target_option_node (&global_options);
14579 }
14580 else
14581 new_target = NULL;
14582
14583 new_optimize = build_optimization_node (&global_options);
14584
14585 if (fndecl && ret)
14586 {
14587 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14588
14589 if (old_optimize != new_optimize)
14590 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14591 }
14592
14593 cl_target_option_restore (&global_options, &cur_target);
14594
14595 if (old_optimize != new_optimize)
14596 cl_optimization_restore (&global_options,
14597 TREE_OPTIMIZATION (old_optimize));
14598 return ret;
14599 }
14600
14601 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14602 the values of tri-bool options (yes, no, DONT_CARE) and the default value
14603 is DEF, determine whether to reject inlining. */
14604
14605 static bool
14606 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14607 int dont_care, int def)
14608 {
14609 /* If the callee doesn't care, always allow inlining. */
14610 if (callee == dont_care)
14611 return true;
14612
14613 /* If the caller doesn't care, always allow inlining. */
14614 if (caller == dont_care)
14615 return true;
14616
14617 /* Otherwise, allow inlining if either the callee and caller values
14618 agree, or if the callee is using the default value. */
14619 return (callee == caller || callee == def);
14620 }
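
/* A worked example, using the DONT_CARE/DEF values that the
   -momit-leaf-frame-pointer check below passes (2 and 1): a caller
   compiled with an explicit -momit-leaf-frame-pointer (1) may not inline
   a callee compiled with an explicit -mno-omit-leaf-frame-pointer (0),
   since 0 matches neither the caller nor the default; but if either side
   left the option at its "don't care" setting (2), inlining is allowed.  */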
14621
14622 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14623 to inline CALLEE into CALLER based on target-specific info.
14624 Make sure that the caller and callee have compatible architectural
14625 features. Then go through the other possible target attributes
14626 and see if they can block inlining. Try not to reject always_inline
14627 callees unless they are incompatible architecturally. */
14628
14629 static bool
14630 aarch64_can_inline_p (tree caller, tree callee)
14631 {
14632 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14633 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14634
14635 struct cl_target_option *caller_opts
14636 = TREE_TARGET_OPTION (caller_tree ? caller_tree
14637 : target_option_default_node);
14638
14639 struct cl_target_option *callee_opts
14640 = TREE_TARGET_OPTION (callee_tree ? callee_tree
14641 : target_option_default_node);
14642
14643 /* Callee's ISA flags should be a subset of the caller's. */
14644 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
14645 != callee_opts->x_aarch64_isa_flags)
14646 return false;
14647
14648 /* Allow non-strict aligned functions inlining into strict
14649 aligned ones. */
14650 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
14651 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
14652 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
14653 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
14654 return false;
14655
14656 bool always_inline = lookup_attribute ("always_inline",
14657 DECL_ATTRIBUTES (callee));
14658
14659 /* If the architectural features match up and the callee is always_inline
14660 then the other attributes don't matter. */
14661 if (always_inline)
14662 return true;
14663
14664 if (caller_opts->x_aarch64_cmodel_var
14665 != callee_opts->x_aarch64_cmodel_var)
14666 return false;
14667
14668 if (caller_opts->x_aarch64_tls_dialect
14669 != callee_opts->x_aarch64_tls_dialect)
14670 return false;
14671
14672 /* Honour explicit requests to workaround errata. */
14673 if (!aarch64_tribools_ok_for_inlining_p (
14674 caller_opts->x_aarch64_fix_a53_err835769,
14675 callee_opts->x_aarch64_fix_a53_err835769,
14676 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
14677 return false;
14678
14679 if (!aarch64_tribools_ok_for_inlining_p (
14680 caller_opts->x_aarch64_fix_a53_err843419,
14681 callee_opts->x_aarch64_fix_a53_err843419,
14682 2, TARGET_FIX_ERR_A53_843419))
14683 return false;
14684
14685 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14686 caller and callee and they don't match up, reject inlining. */
14687 if (!aarch64_tribools_ok_for_inlining_p (
14688 caller_opts->x_flag_omit_leaf_frame_pointer,
14689 callee_opts->x_flag_omit_leaf_frame_pointer,
14690 2, 1))
14691 return false;
14692
14693 /* If the callee has specific tuning overrides, respect them. */
14694 if (callee_opts->x_aarch64_override_tune_string != NULL
14695 && caller_opts->x_aarch64_override_tune_string == NULL)
14696 return false;
14697
14698 /* If the user specified tuning override strings for the
14699 caller and callee and they don't match up, reject inlining.
14700 We just do a string compare here, we don't analyze the meaning
14701 of the string, as it would be too costly for little gain. */
14702 if (callee_opts->x_aarch64_override_tune_string
14703 && caller_opts->x_aarch64_override_tune_string
14704 && (strcmp (callee_opts->x_aarch64_override_tune_string,
14705 caller_opts->x_aarch64_override_tune_string) != 0))
14706 return false;
14707
14708 return true;
14709 }
14710
14711 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
14712 hasn't been initialized already. */
14713
14714 unsigned int
14715 aarch64_tlsdesc_abi_id ()
14716 {
14717 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
14718 if (!tlsdesc_abi.initialized_p ())
14719 {
14720 HARD_REG_SET full_reg_clobbers;
14721 CLEAR_HARD_REG_SET (full_reg_clobbers);
14722 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
14723 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
14724 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
14725 SET_HARD_REG_BIT (full_reg_clobbers, regno);
14726 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
14727 }
14728 return tlsdesc_abi.id ();
14729 }
14730
14731 /* Return true if SYMBOL_REF X binds locally. */
14732
14733 static bool
14734 aarch64_symbol_binds_local_p (const_rtx x)
14735 {
14736 return (SYMBOL_REF_DECL (x)
14737 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
14738 : SYMBOL_REF_LOCAL_P (x));
14739 }
14740
14741 /* Return true if SYMBOL_REF X is thread local. */
14742 static bool
14743 aarch64_tls_symbol_p (rtx x)
14744 {
14745 if (! TARGET_HAVE_TLS)
14746 return false;
14747
14748 if (GET_CODE (x) != SYMBOL_REF)
14749 return false;
14750
14751 return SYMBOL_REF_TLS_MODEL (x) != 0;
14752 }
14753
14754 /* Classify a TLS symbol into one of the TLS kinds. */
14755 enum aarch64_symbol_type
14756 aarch64_classify_tls_symbol (rtx x)
14757 {
14758 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14759
14760 switch (tls_kind)
14761 {
14762 case TLS_MODEL_GLOBAL_DYNAMIC:
14763 case TLS_MODEL_LOCAL_DYNAMIC:
14764 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
14765
14766 case TLS_MODEL_INITIAL_EXEC:
14767 switch (aarch64_cmodel)
14768 {
14769 case AARCH64_CMODEL_TINY:
14770 case AARCH64_CMODEL_TINY_PIC:
14771 return SYMBOL_TINY_TLSIE;
14772 default:
14773 return SYMBOL_SMALL_TLSIE;
14774 }
14775
14776 case TLS_MODEL_LOCAL_EXEC:
14777 if (aarch64_tls_size == 12)
14778 return SYMBOL_TLSLE12;
14779 else if (aarch64_tls_size == 24)
14780 return SYMBOL_TLSLE24;
14781 else if (aarch64_tls_size == 32)
14782 return SYMBOL_TLSLE32;
14783 else if (aarch64_tls_size == 48)
14784 return SYMBOL_TLSLE48;
14785 else
14786 gcc_unreachable ();
14787
14788 case TLS_MODEL_EMULATED:
14789 case TLS_MODEL_NONE:
14790 return SYMBOL_FORCE_TO_MEM;
14791
14792 default:
14793 gcc_unreachable ();
14794 }
14795 }
14796
14797 /* Return the correct method for accessing X + OFFSET, where X is either
14798 a SYMBOL_REF or LABEL_REF. */
14799
14800 enum aarch64_symbol_type
14801 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
14802 {
14803 if (GET_CODE (x) == LABEL_REF)
14804 {
14805 switch (aarch64_cmodel)
14806 {
14807 case AARCH64_CMODEL_LARGE:
14808 return SYMBOL_FORCE_TO_MEM;
14809
14810 case AARCH64_CMODEL_TINY_PIC:
14811 case AARCH64_CMODEL_TINY:
14812 return SYMBOL_TINY_ABSOLUTE;
14813
14814 case AARCH64_CMODEL_SMALL_SPIC:
14815 case AARCH64_CMODEL_SMALL_PIC:
14816 case AARCH64_CMODEL_SMALL:
14817 return SYMBOL_SMALL_ABSOLUTE;
14818
14819 default:
14820 gcc_unreachable ();
14821 }
14822 }
14823
14824 if (GET_CODE (x) == SYMBOL_REF)
14825 {
14826 if (aarch64_tls_symbol_p (x))
14827 return aarch64_classify_tls_symbol (x);
14828
14829 switch (aarch64_cmodel)
14830 {
14831 case AARCH64_CMODEL_TINY:
14832 /* When we retrieve symbol + offset address, we have to make sure
14833 the offset does not cause overflow of the final address. But
14834 we have no way of knowing the address of symbol at compile time
14835 so we can't accurately say if the distance between the PC and
14836 symbol + offset is outside the addressable range of +/-1MB in the
14837 TINY code model. So we limit the maximum offset to +/-64KB and
14838 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
14839 If offset_within_block_p is true we allow larger offsets.
14840 Furthermore force to memory if the symbol is a weak reference to
14841 something that doesn't resolve to a symbol in this module. */
14842
14843 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
14844 return SYMBOL_FORCE_TO_MEM;
14845 if (!(IN_RANGE (offset, -0x10000, 0x10000)
14846 || offset_within_block_p (x, offset)))
14847 return SYMBOL_FORCE_TO_MEM;
14848
14849 return SYMBOL_TINY_ABSOLUTE;
14850
14851 case AARCH64_CMODEL_SMALL:
14852 /* Same reasoning as the tiny code model, but the offset cap here is
14853 1MB, allowing +/-3.9GB for the offset to the symbol. */
14854
14855 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
14856 return SYMBOL_FORCE_TO_MEM;
14857 if (!(IN_RANGE (offset, -0x100000, 0x100000)
14858 || offset_within_block_p (x, offset)))
14859 return SYMBOL_FORCE_TO_MEM;
14860
14861 return SYMBOL_SMALL_ABSOLUTE;
14862
14863 case AARCH64_CMODEL_TINY_PIC:
14864 if (!aarch64_symbol_binds_local_p (x))
14865 return SYMBOL_TINY_GOT;
14866 return SYMBOL_TINY_ABSOLUTE;
14867
14868 case AARCH64_CMODEL_SMALL_SPIC:
14869 case AARCH64_CMODEL_SMALL_PIC:
14870 if (!aarch64_symbol_binds_local_p (x))
14871 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14872 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14873 return SYMBOL_SMALL_ABSOLUTE;
14874
14875 case AARCH64_CMODEL_LARGE:
14876 /* This is alright even in PIC code as the constant
14877 pool reference is always PC relative and within
14878 the same translation unit. */
14879 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14880 return SYMBOL_SMALL_ABSOLUTE;
14881 else
14882 return SYMBOL_FORCE_TO_MEM;
14883
14884 default:
14885 gcc_unreachable ();
14886 }
14887 }
14888
14889 /* By default push everything into the constant pool. */
14890 return SYMBOL_FORCE_TO_MEM;
14891 }
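
/* Worked examples for the offset checks above (illustrative only, for a
   non-weak symbol): under -mcmodel=tiny a reference to sym + 0x8000 stays
   SYMBOL_TINY_ABSOLUTE because 0x8000 is within the +/-64KB cap, while
   sym + 0x20000 is forced to memory unless offset_within_block_p says the
   offset is known to stay inside sym's own block.  Under -mcmodel=small
   the cap is +/-1MB (0x100000) instead.  */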
14892
14893 bool
14894 aarch64_constant_address_p (rtx x)
14895 {
14896 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14897 }
14898
14899 bool
14900 aarch64_legitimate_pic_operand_p (rtx x)
14901 {
14902 if (GET_CODE (x) == SYMBOL_REF
14903 || (GET_CODE (x) == CONST
14904 && GET_CODE (XEXP (x, 0)) == PLUS
14905 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14906 return false;
14907
14908 return true;
14909 }
14910
14911 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14912 that should be rematerialized rather than spilled. */
14913
14914 static bool
14915 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14916 {
14917 /* Support CSE and rematerialization of common constants. */
14918 if (CONST_INT_P (x)
14919 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14920 || GET_CODE (x) == CONST_VECTOR)
14921 return true;
14922
14923 /* Do not allow vector struct mode constants for Advanced SIMD.
14924 We could support 0 and -1 easily, but they need support in
14925 aarch64-simd.md. */
14926 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14927 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14928 return false;
14929
14930 /* Only accept variable-length vector constants if they can be
14931 handled directly.
14932
14933 ??? It would be possible to handle rematerialization of other
14934 constants via secondary reloads. */
14935 if (vec_flags & VEC_ANY_SVE)
14936 return aarch64_simd_valid_immediate (x, NULL);
14937
14938 if (GET_CODE (x) == HIGH)
14939 x = XEXP (x, 0);
14940
14941 /* Accept polynomial constants that can be calculated by using the
14942 destination of a move as the sole temporary. Constants that
14943 require a second temporary cannot be rematerialized (they can't be
14944 forced to memory and also aren't legitimate constants). */
14945 poly_int64 offset;
14946 if (poly_int_rtx_p (x, &offset))
14947 return aarch64_offset_temporaries (false, offset) <= 1;
14948
14949 /* If an offset is being added to something else, we need to allow the
14950 base to be moved into the destination register, meaning that there
14951 are no free temporaries for the offset. */
14952 x = strip_offset (x, &offset);
14953 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14954 return false;
14955
14956 /* Do not allow const (plus (anchor_symbol, const_int)). */
14957 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14958 return false;
14959
14960 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14961 so spilling them is better than rematerialization. */
14962 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14963 return true;
14964
14965 /* Label references are always constant. */
14966 if (GET_CODE (x) == LABEL_REF)
14967 return true;
14968
14969 return false;
14970 }
14971
14972 rtx
14973 aarch64_load_tp (rtx target)
14974 {
14975 if (!target
14976 || GET_MODE (target) != Pmode
14977 || !register_operand (target, Pmode))
14978 target = gen_reg_rtx (Pmode);
14979
14980 /* Can return in any reg. */
14981 emit_insn (gen_aarch64_load_tp_hard (target));
14982 return target;
14983 }
14984
14985 /* On AAPCS systems, this is the "struct __va_list". */
14986 static GTY(()) tree va_list_type;
14987
14988 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14989 Return the type to use as __builtin_va_list.
14990
14991 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14992
14993 struct __va_list
14994 {
14995 void *__stack;
14996 void *__gr_top;
14997 void *__vr_top;
14998 int __gr_offs;
14999 int __vr_offs;
15000 }; */
15001
15002 static tree
15003 aarch64_build_builtin_va_list (void)
15004 {
15005 tree va_list_name;
15006 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15007
15008 /* Create the type. */
15009 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15010 /* Give it the required name. */
15011 va_list_name = build_decl (BUILTINS_LOCATION,
15012 TYPE_DECL,
15013 get_identifier ("__va_list"),
15014 va_list_type);
15015 DECL_ARTIFICIAL (va_list_name) = 1;
15016 TYPE_NAME (va_list_type) = va_list_name;
15017 TYPE_STUB_DECL (va_list_type) = va_list_name;
15018
15019 /* Create the fields. */
15020 f_stack = build_decl (BUILTINS_LOCATION,
15021 FIELD_DECL, get_identifier ("__stack"),
15022 ptr_type_node);
15023 f_grtop = build_decl (BUILTINS_LOCATION,
15024 FIELD_DECL, get_identifier ("__gr_top"),
15025 ptr_type_node);
15026 f_vrtop = build_decl (BUILTINS_LOCATION,
15027 FIELD_DECL, get_identifier ("__vr_top"),
15028 ptr_type_node);
15029 f_groff = build_decl (BUILTINS_LOCATION,
15030 FIELD_DECL, get_identifier ("__gr_offs"),
15031 integer_type_node);
15032 f_vroff = build_decl (BUILTINS_LOCATION,
15033 FIELD_DECL, get_identifier ("__vr_offs"),
15034 integer_type_node);
15035
15036 /* Tell tree-stdarg pass about our internal offset fields.
15037 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15038 purposes, to identify whether the code is updating the va_list internal
15039 offset fields in an irregular way. */
15040 va_list_gpr_counter_field = f_groff;
15041 va_list_fpr_counter_field = f_vroff;
15042
15043 DECL_ARTIFICIAL (f_stack) = 1;
15044 DECL_ARTIFICIAL (f_grtop) = 1;
15045 DECL_ARTIFICIAL (f_vrtop) = 1;
15046 DECL_ARTIFICIAL (f_groff) = 1;
15047 DECL_ARTIFICIAL (f_vroff) = 1;
15048
15049 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15050 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15051 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15052 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15053 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15054
15055 TYPE_FIELDS (va_list_type) = f_stack;
15056 DECL_CHAIN (f_stack) = f_grtop;
15057 DECL_CHAIN (f_grtop) = f_vrtop;
15058 DECL_CHAIN (f_vrtop) = f_groff;
15059 DECL_CHAIN (f_groff) = f_vroff;
15060
15061 /* Compute its layout. */
15062 layout_type (va_list_type);
15063
15064 return va_list_type;
15065 }
15066
15067 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15068 static void
15069 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15070 {
15071 const CUMULATIVE_ARGS *cum;
15072 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15073 tree stack, grtop, vrtop, groff, vroff;
15074 tree t;
15075 int gr_save_area_size = cfun->va_list_gpr_size;
15076 int vr_save_area_size = cfun->va_list_fpr_size;
15077 int vr_offset;
15078
15079 cum = &crtl->args.info;
15080 if (cfun->va_list_gpr_size)
15081 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15082 cfun->va_list_gpr_size);
15083 if (cfun->va_list_fpr_size)
15084 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15085 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15086
15087 if (!TARGET_FLOAT)
15088 {
15089 gcc_assert (cum->aapcs_nvrn == 0);
15090 vr_save_area_size = 0;
15091 }
15092
15093 f_stack = TYPE_FIELDS (va_list_type_node);
15094 f_grtop = DECL_CHAIN (f_stack);
15095 f_vrtop = DECL_CHAIN (f_grtop);
15096 f_groff = DECL_CHAIN (f_vrtop);
15097 f_vroff = DECL_CHAIN (f_groff);
15098
15099 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15100 NULL_TREE);
15101 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15102 NULL_TREE);
15103 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15104 NULL_TREE);
15105 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15106 NULL_TREE);
15107 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15108 NULL_TREE);
15109
15110 /* Emit code to initialize STACK, which points to the next varargs stack
15111 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15112 by named arguments. STACK is 8-byte aligned. */
15113 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15114 if (cum->aapcs_stack_size > 0)
15115 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15116 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15117 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15118
15119 /* Emit code to initialize GRTOP, the top of the GR save area.
15120 virtual_incoming_args_rtx should have been 16 byte aligned. */
15121 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15122 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15123 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15124
15125 /* Emit code to initialize VRTOP, the top of the VR save area.
15126 This address is gr_save_area_bytes below GRTOP, rounded
15127 down to the next 16-byte boundary. */
15128 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15129 vr_offset = ROUND_UP (gr_save_area_size,
15130 STACK_BOUNDARY / BITS_PER_UNIT);
15131
15132 if (vr_offset)
15133 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15134 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15135 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15136
15137 /* Emit code to initialize GROFF, the offset from GRTOP of the
15138 next GPR argument. */
15139 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15140 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15141 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15142
15143 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15144 of the next VR argument. */
15145 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15146 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15147 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15148 }
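
/* A rough worked example of the layout set up above, assuming the default
   of saving every unused argument register (i.e. ignoring the tree-stdarg
   shrinking) and a variadic function with one named general-register
   argument and no named stack arguments: gr_save_area_size is
   7 * 8 == 56 bytes and vr_save_area_size is 8 * 16 == 128 bytes, so
   __stack points at the incoming argument area, __gr_top equals that same
   address, __vr_top sits ROUND_UP (56, 16) == 64 bytes below it,
   __gr_offs is -56 and __vr_offs is -128.  */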
15149
15150 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15151
15152 static tree
15153 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15154 gimple_seq *post_p ATTRIBUTE_UNUSED)
15155 {
15156 tree addr;
15157 bool indirect_p;
15158 bool is_ha; /* is HFA or HVA. */
15159 bool dw_align; /* double-word align. */
15160 machine_mode ag_mode = VOIDmode;
15161 int nregs;
15162 machine_mode mode;
15163
15164 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15165 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15166 HOST_WIDE_INT size, rsize, adjust, align;
15167 tree t, u, cond1, cond2;
15168
15169 indirect_p = pass_va_arg_by_reference (type);
15170 if (indirect_p)
15171 type = build_pointer_type (type);
15172
15173 mode = TYPE_MODE (type);
15174
15175 f_stack = TYPE_FIELDS (va_list_type_node);
15176 f_grtop = DECL_CHAIN (f_stack);
15177 f_vrtop = DECL_CHAIN (f_grtop);
15178 f_groff = DECL_CHAIN (f_vrtop);
15179 f_vroff = DECL_CHAIN (f_groff);
15180
15181 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15182 f_stack, NULL_TREE);
15183 size = int_size_in_bytes (type);
15184
15185 bool abi_break;
15186 align
15187 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15188
15189 dw_align = false;
15190 adjust = 0;
15191 if (aarch64_vfp_is_call_or_return_candidate (mode,
15192 type,
15193 &ag_mode,
15194 &nregs,
15195 &is_ha))
15196 {
15197 /* No frontends can create types with variable-sized modes, so we
15198 shouldn't be asked to pass or return them. */
15199 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15200
15201 /* TYPE passed in fp/simd registers. */
15202 if (!TARGET_FLOAT)
15203 aarch64_err_no_fpadvsimd (mode);
15204
15205 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15206 unshare_expr (valist), f_vrtop, NULL_TREE);
15207 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15208 unshare_expr (valist), f_vroff, NULL_TREE);
15209
15210 rsize = nregs * UNITS_PER_VREG;
15211
15212 if (is_ha)
15213 {
15214 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15215 adjust = UNITS_PER_VREG - ag_size;
15216 }
15217 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15218 && size < UNITS_PER_VREG)
15219 {
15220 adjust = UNITS_PER_VREG - size;
15221 }
15222 }
15223 else
15224 {
15225 /* TYPE passed in general registers. */
15226 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15227 unshare_expr (valist), f_grtop, NULL_TREE);
15228 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15229 unshare_expr (valist), f_groff, NULL_TREE);
15230 rsize = ROUND_UP (size, UNITS_PER_WORD);
15231 nregs = rsize / UNITS_PER_WORD;
15232
15233 if (align > 8)
15234 {
15235 if (abi_break && warn_psabi)
15236 inform (input_location, "parameter passing for argument of type "
15237 "%qT changed in GCC 9.1", type);
15238 dw_align = true;
15239 }
15240
15241 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15242 && size < UNITS_PER_WORD)
15243 {
15244 adjust = UNITS_PER_WORD - size;
15245 }
15246 }
15247
15248 /* Get a local temporary for the field value. */
15249 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15250
15251 /* Emit code to branch if off >= 0. */
15252 t = build2 (GE_EXPR, boolean_type_node, off,
15253 build_int_cst (TREE_TYPE (off), 0));
15254 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15255
15256 if (dw_align)
15257 {
15258 /* Emit: offs = (offs + 15) & -16. */
15259 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15260 build_int_cst (TREE_TYPE (off), 15));
15261 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15262 build_int_cst (TREE_TYPE (off), -16));
15263 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15264 }
15265 else
15266 roundup = NULL;
15267
15268 /* Update ap.__[g|v]r_offs */
15269 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15270 build_int_cst (TREE_TYPE (off), rsize));
15271 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15272
15273 /* String up. */
15274 if (roundup)
15275 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15276
15277 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15278 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15279 build_int_cst (TREE_TYPE (f_off), 0));
15280 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15281
15282 /* String up: make sure the assignment happens before the use. */
15283 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15284 COND_EXPR_ELSE (cond1) = t;
15285
15286 /* Prepare the trees handling the argument that is passed on the stack;
15287 the top-level node will be stored in ON_STACK. */
15288 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15289 if (align > 8)
15290 {
15291 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15292 t = fold_build_pointer_plus_hwi (arg, 15);
15293 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15294 build_int_cst (TREE_TYPE (t), -16));
15295 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15296 }
15297 else
15298 roundup = NULL;
15299 /* Advance ap.__stack */
15300 t = fold_build_pointer_plus_hwi (arg, size + 7);
15301 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15302 build_int_cst (TREE_TYPE (t), -8));
15303 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15304 /* String up roundup and advance. */
15305 if (roundup)
15306 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15307 /* String up with arg */
15308 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15309 /* Big-endianness related address adjustment. */
15310 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15311 && size < UNITS_PER_WORD)
15312 {
15313 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15314 size_int (UNITS_PER_WORD - size));
15315 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15316 }
15317
15318 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15319 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15320
15321 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15322 t = off;
15323 if (adjust)
15324 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15325 build_int_cst (TREE_TYPE (off), adjust));
15326
15327 t = fold_convert (sizetype, t);
15328 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15329
15330 if (is_ha)
15331 {
15332 /* type ha; // treat as "struct {ftype field[n];}"
15333 ... [computing offs]
15334 for (i = 0; i < nregs; ++i, offs += 16)
15335 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15336 return ha; */
15337 int i;
15338 tree tmp_ha, field_t, field_ptr_t;
15339
15340 /* Declare a local variable. */
15341 tmp_ha = create_tmp_var_raw (type, "ha");
15342 gimple_add_tmp_var (tmp_ha);
15343
15344 /* Establish the base type. */
15345 switch (ag_mode)
15346 {
15347 case E_SFmode:
15348 field_t = float_type_node;
15349 field_ptr_t = float_ptr_type_node;
15350 break;
15351 case E_DFmode:
15352 field_t = double_type_node;
15353 field_ptr_t = double_ptr_type_node;
15354 break;
15355 case E_TFmode:
15356 field_t = long_double_type_node;
15357 field_ptr_t = long_double_ptr_type_node;
15358 break;
15359 case E_HFmode:
15360 field_t = aarch64_fp16_type_node;
15361 field_ptr_t = aarch64_fp16_ptr_type_node;
15362 break;
15363 case E_V2SImode:
15364 case E_V4SImode:
15365 {
15366 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15367 field_t = build_vector_type_for_mode (innertype, ag_mode);
15368 field_ptr_t = build_pointer_type (field_t);
15369 }
15370 break;
15371 default:
15372 gcc_assert (0);
15373 }
15374
15375 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15376 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15377 addr = t;
15378 t = fold_convert (field_ptr_t, addr);
15379 t = build2 (MODIFY_EXPR, field_t,
15380 build1 (INDIRECT_REF, field_t, tmp_ha),
15381 build1 (INDIRECT_REF, field_t, t));
15382
15383 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15384 for (i = 1; i < nregs; ++i)
15385 {
15386 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15387 u = fold_convert (field_ptr_t, addr);
15388 u = build2 (MODIFY_EXPR, field_t,
15389 build2 (MEM_REF, field_t, tmp_ha,
15390 build_int_cst (field_ptr_t,
15391 (i *
15392 int_size_in_bytes (field_t)))),
15393 build1 (INDIRECT_REF, field_t, u));
15394 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15395 }
15396
15397 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15398 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15399 }
15400
15401 COND_EXPR_ELSE (cond2) = t;
15402 addr = fold_convert (build_pointer_type (type), cond1);
15403 addr = build_va_arg_indirect_ref (addr);
15404
15405 if (indirect_p)
15406 addr = build_va_arg_indirect_ref (addr);
15407
15408 return addr;
15409 }
15410
15411 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15412
15413 static void
15414 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15415 const function_arg_info &arg,
15416 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15417 {
15418 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15419 CUMULATIVE_ARGS local_cum;
15420 int gr_saved = cfun->va_list_gpr_size;
15421 int vr_saved = cfun->va_list_fpr_size;
15422
15423 /* The caller has advanced CUM up to, but not beyond, the last named
15424 argument. Advance a local copy of CUM past the last "real" named
15425 argument, to find out how many registers are left over. */
15426 local_cum = *cum;
15427 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
15428
15429 /* Find out how many registers we need to save.
15430 Honor tree-stdarg analysis results. */
15431 if (cfun->va_list_gpr_size)
15432 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15433 cfun->va_list_gpr_size / UNITS_PER_WORD);
15434 if (cfun->va_list_fpr_size)
15435 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15436 cfun->va_list_fpr_size / UNITS_PER_VREG);
15437
15438 if (!TARGET_FLOAT)
15439 {
15440 gcc_assert (local_cum.aapcs_nvrn == 0);
15441 vr_saved = 0;
15442 }
15443
15444 if (!no_rtl)
15445 {
15446 if (gr_saved > 0)
15447 {
15448 rtx ptr, mem;
15449
15450 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15451 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15452 - gr_saved * UNITS_PER_WORD);
15453 mem = gen_frame_mem (BLKmode, ptr);
15454 set_mem_alias_set (mem, get_varargs_alias_set ());
15455
15456 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15457 mem, gr_saved);
15458 }
15459 if (vr_saved > 0)
15460 {
15461 /* We can't use move_block_from_reg, because it will use
15462 the wrong mode, storing D regs only. */
15463 machine_mode mode = TImode;
15464 int off, i, vr_start;
15465
15466 /* Set OFF to the offset from virtual_incoming_args_rtx of
15467 the first vector register. The VR save area lies below
15468 the GR one, and is aligned to 16 bytes. */
15469 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15470 STACK_BOUNDARY / BITS_PER_UNIT);
15471 off -= vr_saved * UNITS_PER_VREG;
15472
15473 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15474 for (i = 0; i < vr_saved; ++i)
15475 {
15476 rtx ptr, mem;
15477
15478 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15479 mem = gen_frame_mem (mode, ptr);
15480 set_mem_alias_set (mem, get_varargs_alias_set ());
15481 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15482 off += UNITS_PER_VREG;
15483 }
15484 }
15485 }
15486
15487 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15488 any complication of having crtl->args.pretend_args_size changed. */
15489 cfun->machine->frame.saved_varargs_size
15490 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15491 STACK_BOUNDARY / BITS_PER_UNIT)
15492 + vr_saved * UNITS_PER_VREG);
15493 }
15494
15495 static void
15496 aarch64_conditional_register_usage (void)
15497 {
15498 int i;
15499 if (!TARGET_FLOAT)
15500 {
15501 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15502 {
15503 fixed_regs[i] = 1;
15504 call_used_regs[i] = 1;
15505 }
15506 }
15507 if (!TARGET_SVE)
15508 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15509 {
15510 fixed_regs[i] = 1;
15511 call_used_regs[i] = 1;
15512 }
15513
15514 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15515 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15516 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15517
15518 /* When tracking speculation, we need a couple of call-clobbered registers
15519 to track the speculation state. It would be nice to just use
15520 IP0 and IP1, but currently there are numerous places that just
15521 assume these registers are free for other uses (e.g. pointer
15522 authentication). */
15523 if (aarch64_track_speculation)
15524 {
15525 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15526 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15527 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15528 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15529 }
15530 }
15531
15532 /* Walk down the type tree of TYPE counting consecutive base elements.
15533 If *MODEP is VOIDmode, then set it to the first valid floating point
15534 type. If a non-floating point type is found, or if a floating point
15535 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15536 otherwise return the count in the sub-tree. */
15537 static int
15538 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15539 {
15540 machine_mode mode;
15541 HOST_WIDE_INT size;
15542
15543 /* SVE types (and types containing SVE types) must be handled
15544 before calling this function. */
15545 gcc_assert (!aarch64_sve::builtin_type_p (type));
15546
15547 switch (TREE_CODE (type))
15548 {
15549 case REAL_TYPE:
15550 mode = TYPE_MODE (type);
15551 if (mode != DFmode && mode != SFmode
15552 && mode != TFmode && mode != HFmode)
15553 return -1;
15554
15555 if (*modep == VOIDmode)
15556 *modep = mode;
15557
15558 if (*modep == mode)
15559 return 1;
15560
15561 break;
15562
15563 case COMPLEX_TYPE:
15564 mode = TYPE_MODE (TREE_TYPE (type));
15565 if (mode != DFmode && mode != SFmode
15566 && mode != TFmode && mode != HFmode)
15567 return -1;
15568
15569 if (*modep == VOIDmode)
15570 *modep = mode;
15571
15572 if (*modep == mode)
15573 return 2;
15574
15575 break;
15576
15577 case VECTOR_TYPE:
15578 /* Use V2SImode and V4SImode as representatives of all 64-bit
15579 and 128-bit vector types. */
15580 size = int_size_in_bytes (type);
15581 switch (size)
15582 {
15583 case 8:
15584 mode = V2SImode;
15585 break;
15586 case 16:
15587 mode = V4SImode;
15588 break;
15589 default:
15590 return -1;
15591 }
15592
15593 if (*modep == VOIDmode)
15594 *modep = mode;
15595
15596 /* Vector modes are considered to be opaque: two vectors are
15597 equivalent for the purposes of being homogeneous aggregates
15598 if they are the same size. */
15599 if (*modep == mode)
15600 return 1;
15601
15602 break;
15603
15604 case ARRAY_TYPE:
15605 {
15606 int count;
15607 tree index = TYPE_DOMAIN (type);
15608
15609 /* Can't handle incomplete types nor sizes that are not
15610 fixed. */
15611 if (!COMPLETE_TYPE_P (type)
15612 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15613 return -1;
15614
15615 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15616 if (count == -1
15617 || !index
15618 || !TYPE_MAX_VALUE (index)
15619 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15620 || !TYPE_MIN_VALUE (index)
15621 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15622 || count < 0)
15623 return -1;
15624
15625 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15626 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15627
15628 /* There must be no padding. */
15629 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15630 count * GET_MODE_BITSIZE (*modep)))
15631 return -1;
15632
15633 return count;
15634 }
15635
15636 case RECORD_TYPE:
15637 {
15638 int count = 0;
15639 int sub_count;
15640 tree field;
15641
15642 /* Can't handle incomplete types nor sizes that are not
15643 fixed. */
15644 if (!COMPLETE_TYPE_P (type)
15645 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15646 return -1;
15647
15648 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15649 {
15650 if (TREE_CODE (field) != FIELD_DECL)
15651 continue;
15652
15653 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15654 if (sub_count < 0)
15655 return -1;
15656 count += sub_count;
15657 }
15658
15659 /* There must be no padding. */
15660 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15661 count * GET_MODE_BITSIZE (*modep)))
15662 return -1;
15663
15664 return count;
15665 }
15666
15667 case UNION_TYPE:
15668 case QUAL_UNION_TYPE:
15669 {
15670 /* These aren't very interesting except in a degenerate case. */
15671 int count = 0;
15672 int sub_count;
15673 tree field;
15674
15675 /* Can't handle incomplete types nor sizes that are not
15676 fixed. */
15677 if (!COMPLETE_TYPE_P (type)
15678 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15679 return -1;
15680
15681 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15682 {
15683 if (TREE_CODE (field) != FIELD_DECL)
15684 continue;
15685
15686 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15687 if (sub_count < 0)
15688 return -1;
15689 count = count > sub_count ? count : sub_count;
15690 }
15691
15692 /* There must be no padding. */
15693 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15694 count * GET_MODE_BITSIZE (*modep)))
15695 return -1;
15696
15697 return count;
15698 }
15699
15700 default:
15701 break;
15702 }
15703
15704 return -1;
15705 }
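
/* Worked examples of the walk above (illustrative only):
   "struct { float x, y, z; }" returns 3 with *MODEP == SFmode, i.e. a
   homogeneous floating-point aggregate; "struct { float f; double d; }"
   returns -1 because DFmode does not match the SFmode already recorded;
   and "struct { float32x4_t a, b; }" returns 2 with *MODEP == V4SImode,
   the representative mode for 128-bit vectors, making it a homogeneous
   short-vector aggregate.  */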
15706
15707 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15708 type as described in AAPCS64 \S 4.1.2.
15709
15710 See the comment above aarch64_composite_type_p for the notes on MODE. */
15711
15712 static bool
15713 aarch64_short_vector_p (const_tree type,
15714 machine_mode mode)
15715 {
15716 poly_int64 size = -1;
15717
15718 if (type && aarch64_sve::builtin_type_p (type))
15719 return false;
15720
15721 if (type && TREE_CODE (type) == VECTOR_TYPE)
15722 size = int_size_in_bytes (type);
15723 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
15724 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
15725 size = GET_MODE_SIZE (mode);
15726
15727 return known_eq (size, 8) || known_eq (size, 16);
15728 }
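
/* For example (illustrative only): a 64-bit "float32x2_t" or a 128-bit
   "int32x4_t" is a short vector in the AAPCS64 sense, while an SVE
   builtin type, a scalar, or a 256-bit GNU vector
   (e.g. "int __attribute__ ((vector_size (32)))") is not.  */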
15729
15730 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15731 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15732 array types. The C99 floating-point complex types are also considered
15733 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15734 types, which are GCC extensions and out of the scope of AAPCS64, are
15735 treated as composite types here as well.
15736
15737 Note that MODE itself is not sufficient in determining whether a type
15738 is such a composite type or not. This is because
15739 stor-layout.c:compute_record_mode may have already changed the MODE
15740 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15741 structure with only one field may have its MODE set to the mode of the
15742 field. Also an integer mode whose size matches the size of the
15743 RECORD_TYPE type may be used to substitute the original mode
15744 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15745 solely relied on. */
15746
15747 static bool
15748 aarch64_composite_type_p (const_tree type,
15749 machine_mode mode)
15750 {
15751 if (aarch64_short_vector_p (type, mode))
15752 return false;
15753
15754 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
15755 return true;
15756
15757 if (mode == BLKmode
15758 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
15759 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
15760 return true;
15761
15762 return false;
15763 }
15764
15765 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
15766 shall be passed or returned in simd/fp register(s) (providing these
15767 parameter passing registers are available).
15768
15769 Upon successful return, *COUNT returns the number of needed registers,
15770 *BASE_MODE returns the mode of the individual register and, when IS_HA
15771 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
15772 floating-point aggregate or a homogeneous short-vector aggregate. */
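/* For example, "struct { double a, b; }" is a homogeneous floating-point
   aggregate that succeeds with *COUNT == 2 and *BASE_MODE == DFmode, while
   "struct { double a; int b; }" is not a candidate and falls back to the
   normal rules for composite types.  */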
15773
15774 static bool
15775 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
15776 const_tree type,
15777 machine_mode *base_mode,
15778 int *count,
15779 bool *is_ha)
15780 {
15781 if (is_ha != NULL) *is_ha = false;
15782
15783 if (type && aarch64_sve::builtin_type_p (type))
15784 return false;
15785
15786 machine_mode new_mode = VOIDmode;
15787 bool composite_p = aarch64_composite_type_p (type, mode);
15788
15789 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
15790 || aarch64_short_vector_p (type, mode))
15791 {
15792 *count = 1;
15793 new_mode = mode;
15794 }
15795 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
15796 {
15797 if (is_ha != NULL) *is_ha = true;
15798 *count = 2;
15799 new_mode = GET_MODE_INNER (mode);
15800 }
15801 else if (type && composite_p)
15802 {
15803 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
15804
15805 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
15806 {
15807 if (is_ha != NULL) *is_ha = true;
15808 *count = ag_count;
15809 }
15810 else
15811 return false;
15812 }
15813 else
15814 return false;
15815
15816 *base_mode = new_mode;
15817 return true;
15818 }
15819
15820 /* Implement TARGET_STRUCT_VALUE_RTX. */
15821
15822 static rtx
15823 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
15824 int incoming ATTRIBUTE_UNUSED)
15825 {
15826 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
15827 }
15828
15829 /* Implements target hook vector_mode_supported_p. */
15830 static bool
15831 aarch64_vector_mode_supported_p (machine_mode mode)
15832 {
15833 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15834 return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0;
15835 }
15836
15837 /* Return the full-width SVE vector mode for element mode MODE, if one
15838 exists. */
15839 opt_machine_mode
15840 aarch64_full_sve_mode (scalar_mode mode)
15841 {
15842 switch (mode)
15843 {
15844 case E_DFmode:
15845 return VNx2DFmode;
15846 case E_SFmode:
15847 return VNx4SFmode;
15848 case E_HFmode:
15849 return VNx8HFmode;
15850 case E_DImode:
15851 return VNx2DImode;
15852 case E_SImode:
15853 return VNx4SImode;
15854 case E_HImode:
15855 return VNx8HImode;
15856 case E_QImode:
15857 return VNx16QImode;
15858 default:
15859 return opt_machine_mode ();
15860 }
15861 }
15862
15863 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15864 if it exists. */
15865 opt_machine_mode
15866 aarch64_vq_mode (scalar_mode mode)
15867 {
15868 switch (mode)
15869 {
15870 case E_DFmode:
15871 return V2DFmode;
15872 case E_SFmode:
15873 return V4SFmode;
15874 case E_HFmode:
15875 return V8HFmode;
15876 case E_SImode:
15877 return V4SImode;
15878 case E_HImode:
15879 return V8HImode;
15880 case E_QImode:
15881 return V16QImode;
15882 case E_DImode:
15883 return V2DImode;
15884 default:
15885 return opt_machine_mode ();
15886 }
15887 }
15888
15889 /* Return appropriate SIMD container
15890 for MODE within a vector of WIDTH bits. */
15891 static machine_mode
15892 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15893 {
15894 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15895 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15896
15897 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15898 if (TARGET_SIMD)
15899 {
15900 if (known_eq (width, 128))
15901 return aarch64_vq_mode (mode).else_mode (word_mode);
15902 else
15903 switch (mode)
15904 {
15905 case E_SFmode:
15906 return V2SFmode;
15907 case E_HFmode:
15908 return V4HFmode;
15909 case E_SImode:
15910 return V2SImode;
15911 case E_HImode:
15912 return V4HImode;
15913 case E_QImode:
15914 return V8QImode;
15915 default:
15916 break;
15917 }
15918 }
15919 return word_mode;
15920 }
15921
15922 /* Return the preferred SIMD container mode for MODE: a full SVE vector if SVE is enabled, otherwise a 128-bit Advanced SIMD container. */
15923 static machine_mode
15924 aarch64_preferred_simd_mode (scalar_mode mode)
15925 {
15926 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15927 return aarch64_simd_container_mode (mode, bits);
15928 }
15929
15930 /* Return a list of possible vector sizes for the vectorizer
15931 to iterate over. */
15932 static void
15933 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15934 {
15935 if (TARGET_SVE)
15936 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15937 sizes->safe_push (16);
15938 sizes->safe_push (8);
15939 }
15940
15941 /* Implement TARGET_MANGLE_TYPE. */
15942
15943 static const char *
15944 aarch64_mangle_type (const_tree type)
15945 {
15946 /* The AArch64 ABI documents say that "__va_list" has to be
15947 mangled as if it is in the "std" namespace. */
15948 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15949 return "St9__va_list";
15950
15951 /* Half-precision float. */
15952 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15953 return "Dh";
15954
15955 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15956 builtin types. */
15957 if (TYPE_NAME (type) != NULL)
15958 {
15959 const char *res;
15960 if ((res = aarch64_general_mangle_builtin_type (type))
15961 || (res = aarch64_sve::mangle_builtin_type (type)))
15962 return res;
15963 }
15964
15965 /* Use the default mangling. */
15966 return NULL;
15967 }
15968
15969 /* Find the first rtx_insn before insn that will generate an assembly
15970 instruction. */
15971
15972 static rtx_insn *
15973 aarch64_prev_real_insn (rtx_insn *insn)
15974 {
15975 if (!insn)
15976 return NULL;
15977
15978 do
15979 {
15980 insn = prev_real_insn (insn);
15981 }
15982 while (insn && recog_memoized (insn) < 0);
15983
15984 return insn;
15985 }
15986
15987 static bool
15988 is_madd_op (enum attr_type t1)
15989 {
15990 unsigned int i;
15991 /* A number of these may be AArch32 only. */
15992 enum attr_type mlatypes[] = {
15993 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15994 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15995 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15996 };
15997
15998 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15999 {
16000 if (t1 == mlatypes[i])
16001 return true;
16002 }
16003
16004 return false;
16005 }
16006
16007 /* Check if there is a register dependency between a load and the insn
16008 for which we hold recog_data. */
16009
16010 static bool
16011 dep_between_memop_and_curr (rtx memop)
16012 {
16013 rtx load_reg;
16014 int opno;
16015
16016 gcc_assert (GET_CODE (memop) == SET);
16017
16018 if (!REG_P (SET_DEST (memop)))
16019 return false;
16020
16021 load_reg = SET_DEST (memop);
16022 for (opno = 1; opno < recog_data.n_operands; opno++)
16023 {
16024 rtx operand = recog_data.operand[opno];
16025 if (REG_P (operand)
16026 && reg_overlap_mentioned_p (load_reg, operand))
16027 return true;
16028
16029 }
16030 return false;
16031 }
16032
16033
16034 /* When working around the Cortex-A53 erratum 835769,
16035 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16036 instruction and has a preceding memory instruction such that a NOP
16037 should be inserted between them. */
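/* For instance, under -mfix-cortex-a53-835769 a sequence such as
     ldr x10, [sp, 16]
     madd x0, x1, x2, x3
   has a nop emitted between the two instructions by
   aarch64_final_prescan_insn below, because the load and the 64-bit
   multiply-accumulate have no register dependency.  */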
16038
16039 bool
16040 aarch64_madd_needs_nop (rtx_insn* insn)
16041 {
16042 enum attr_type attr_type;
16043 rtx_insn *prev;
16044 rtx body;
16045
16046 if (!TARGET_FIX_ERR_A53_835769)
16047 return false;
16048
16049 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16050 return false;
16051
16052 attr_type = get_attr_type (insn);
16053 if (!is_madd_op (attr_type))
16054 return false;
16055
16056 prev = aarch64_prev_real_insn (insn);
16057 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16058 Restore recog state to INSN to avoid state corruption. */
16059 extract_constrain_insn_cached (insn);
16060
16061 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16062 return false;
16063
16064 body = single_set (prev);
16065
16066 /* If the previous insn is a memory op and there is no dependency between
16067 it and the DImode madd, emit a NOP between them. If body is NULL then we
16068 have a complex memory operation, probably a load/store pair.
16069 Be conservative for now and emit a NOP. */
16070 if (GET_MODE (recog_data.operand[0]) == DImode
16071 && (!body || !dep_between_memop_and_curr (body)))
16072 return true;
16073
16074 return false;
16075
16076 }
16077
16078
16079 /* Implement FINAL_PRESCAN_INSN. */
16080
16081 void
16082 aarch64_final_prescan_insn (rtx_insn *insn)
16083 {
16084 if (aarch64_madd_needs_nop (insn))
16085 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16086 }
16087
16088
16089 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16090 instruction. */
16091
16092 bool
16093 aarch64_sve_index_immediate_p (rtx base_or_step)
16094 {
16095 return (CONST_INT_P (base_or_step)
16096 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16097 }
16098
16099 /* Return true if X is a valid immediate for the SVE ADD and SUB
16100 instructions. Negate X first if NEGATE_P is true. */
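/* After masking to the element width, the accepted immediates are 0..255
   or a multiple of 256 no greater than 0xff00, i.e. the unshifted and
   LSL #8 forms of the 8-bit immediate.  */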
16101
16102 bool
16103 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
16104 {
16105 rtx elt;
16106
16107 if (!const_vec_duplicate_p (x, &elt)
16108 || !CONST_INT_P (elt))
16109 return false;
16110
16111 HOST_WIDE_INT val = INTVAL (elt);
16112 if (negate_p)
16113 val = -val;
16114 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
16115
16116 if (val & 0xff)
16117 return IN_RANGE (val, 0, 0xff);
16118 return IN_RANGE (val, 0, 0xff00);
16119 }
16120
16121 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16122 instructions. Negate X first if NEGATE_P is true. */
16123
16124 bool
16125 aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p)
16126 {
16127 rtx elt;
16128
16129 if (!const_vec_duplicate_p (x, &elt)
16130 || !CONST_INT_P (elt))
16131 return false;
16132
16133 if (!aarch64_sve_arith_immediate_p (x, negate_p))
16134 return false;
16135
16136 /* After the optional negation, the immediate must be nonnegative.
16137 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16138 instead of SQADD Zn.B, Zn.B, #129. */
16139 return negate_p == (INTVAL (elt) < 0);
16140 }
16141
16142 /* Return true if X is a valid immediate operand for an SVE logical
16143 instruction such as AND. */
16144
16145 bool
16146 aarch64_sve_bitmask_immediate_p (rtx x)
16147 {
16148 rtx elt;
16149
16150 return (const_vec_duplicate_p (x, &elt)
16151 && CONST_INT_P (elt)
16152 && aarch64_bitmask_imm (INTVAL (elt),
16153 GET_MODE_INNER (GET_MODE (x))));
16154 }
16155
16156 /* Return true if X is a valid immediate for the SVE DUP and CPY
16157 instructions. */
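/* That is, a signed 8-bit value, either unshifted (-128..127) or shifted
   left by 8 (multiples of 256 in the range -32768..32512).  */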
16158
16159 bool
16160 aarch64_sve_dup_immediate_p (rtx x)
16161 {
16162 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16163 if (!CONST_INT_P (x))
16164 return false;
16165
16166 HOST_WIDE_INT val = INTVAL (x);
16167 if (val & 0xff)
16168 return IN_RANGE (val, -0x80, 0x7f);
16169 return IN_RANGE (val, -0x8000, 0x7f00);
16170 }
16171
16172 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16173 SIGNED_P says whether the operand is signed rather than unsigned. */
16174
16175 bool
16176 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16177 {
16178 x = unwrap_const_vec_duplicate (x);
16179 return (CONST_INT_P (x)
16180 && (signed_p
16181 ? IN_RANGE (INTVAL (x), -16, 15)
16182 : IN_RANGE (INTVAL (x), 0, 127)));
16183 }
16184
16185 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16186 instruction. Negate X first if NEGATE_P is true. */
16187
16188 bool
16189 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16190 {
16191 rtx elt;
16192 REAL_VALUE_TYPE r;
16193
16194 if (!const_vec_duplicate_p (x, &elt)
16195 || GET_CODE (elt) != CONST_DOUBLE)
16196 return false;
16197
16198 r = *CONST_DOUBLE_REAL_VALUE (elt);
16199
16200 if (negate_p)
16201 r = real_value_negate (&r);
16202
16203 if (real_equal (&r, &dconst1))
16204 return true;
16205 if (real_equal (&r, &dconsthalf))
16206 return true;
16207 return false;
16208 }
16209
16210 /* Return true if X is a valid immediate operand for an SVE FMUL
16211 instruction. */
16212
16213 bool
16214 aarch64_sve_float_mul_immediate_p (rtx x)
16215 {
16216 rtx elt;
16217
16218 return (const_vec_duplicate_p (x, &elt)
16219 && GET_CODE (elt) == CONST_DOUBLE
16220 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16221 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16222 }
16223
16224 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16225 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16226 is nonnull, use it to describe valid immediates. */
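/* For example, replicating 0x0000ab00 matches the 4-byte LSL case below
   (value 0xab, shift 8); replicating 0x00ab00ab matches the 2-byte case
   with no shift; and 0x00abffff, whose low 16 bits are all ones, matches
   the MSL case with shift 16.  */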
16227 static bool
16228 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16229 simd_immediate_info *info,
16230 enum simd_immediate_check which,
16231 simd_immediate_info::insn_type insn)
16232 {
16233 /* Try a 4-byte immediate with LSL. */
16234 for (unsigned int shift = 0; shift < 32; shift += 8)
16235 if ((val32 & (0xff << shift)) == val32)
16236 {
16237 if (info)
16238 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16239 simd_immediate_info::LSL, shift);
16240 return true;
16241 }
16242
16243 /* Try a 2-byte immediate with LSL. */
16244 unsigned int imm16 = val32 & 0xffff;
16245 if (imm16 == (val32 >> 16))
16246 for (unsigned int shift = 0; shift < 16; shift += 8)
16247 if ((imm16 & (0xff << shift)) == imm16)
16248 {
16249 if (info)
16250 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16251 simd_immediate_info::LSL, shift);
16252 return true;
16253 }
16254
16255 /* Try a 4-byte immediate with MSL, except for cases that MVN
16256 can handle. */
16257 if (which == AARCH64_CHECK_MOV)
16258 for (unsigned int shift = 8; shift < 24; shift += 8)
16259 {
16260 unsigned int low = (1 << shift) - 1;
16261 if (((val32 & (0xff << shift)) | low) == val32)
16262 {
16263 if (info)
16264 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16265 simd_immediate_info::MSL, shift);
16266 return true;
16267 }
16268 }
16269
16270 return false;
16271 }
16272
16273 /* Return true if replicating VAL64 is a valid immediate for the
16274 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16275 use it to describe valid immediates. */
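/* For example, 0xff0000ffff0000ff cannot be built from a shifted 8-bit
   MOVI or MVNI, but every byte is either 0x00 or 0xff, so the
   bit-to-bytemask test below accepts it (the 64-bit MOVI form).  */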
16276 static bool
16277 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16278 simd_immediate_info *info,
16279 enum simd_immediate_check which)
16280 {
16281 unsigned int val32 = val64 & 0xffffffff;
16282 unsigned int val16 = val64 & 0xffff;
16283 unsigned int val8 = val64 & 0xff;
16284
16285 if (val32 == (val64 >> 32))
16286 {
16287 if ((which & AARCH64_CHECK_ORR) != 0
16288 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16289 simd_immediate_info::MOV))
16290 return true;
16291
16292 if ((which & AARCH64_CHECK_BIC) != 0
16293 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16294 simd_immediate_info::MVN))
16295 return true;
16296
16297 /* Try using a replicated byte. */
16298 if (which == AARCH64_CHECK_MOV
16299 && val16 == (val32 >> 16)
16300 && val8 == (val16 >> 8))
16301 {
16302 if (info)
16303 *info = simd_immediate_info (QImode, val8);
16304 return true;
16305 }
16306 }
16307
16308 /* Try using a bit-to-bytemask. */
16309 if (which == AARCH64_CHECK_MOV)
16310 {
16311 unsigned int i;
16312 for (i = 0; i < 64; i += 8)
16313 {
16314 unsigned char byte = (val64 >> i) & 0xff;
16315 if (byte != 0 && byte != 0xff)
16316 break;
16317 }
16318 if (i == 64)
16319 {
16320 if (info)
16321 *info = simd_immediate_info (DImode, val64);
16322 return true;
16323 }
16324 }
16325 return false;
16326 }
16327
16328 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16329 instruction. If INFO is nonnull, use it to describe valid immediates. */
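/* A few concrete cases the cascade below accepts: a value with 0x01 in
   every byte is a QImode DUP of 1; replicating the 16-bit pattern 0x1200
   is a HImode DUP with LSL #8; and replicating the 16-bit pattern 0x00ff
   is accepted as a DUPM bitmask immediate.  */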
16330
16331 static bool
16332 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16333 simd_immediate_info *info)
16334 {
16335 scalar_int_mode mode = DImode;
16336 unsigned int val32 = val64 & 0xffffffff;
16337 if (val32 == (val64 >> 32))
16338 {
16339 mode = SImode;
16340 unsigned int val16 = val32 & 0xffff;
16341 if (val16 == (val32 >> 16))
16342 {
16343 mode = HImode;
16344 unsigned int val8 = val16 & 0xff;
16345 if (val8 == (val16 >> 8))
16346 mode = QImode;
16347 }
16348 }
16349 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16350 if (IN_RANGE (val, -0x80, 0x7f))
16351 {
16352 /* DUP with no shift. */
16353 if (info)
16354 *info = simd_immediate_info (mode, val);
16355 return true;
16356 }
16357 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16358 {
16359 /* DUP with LSL #8. */
16360 if (info)
16361 *info = simd_immediate_info (mode, val);
16362 return true;
16363 }
16364 if (aarch64_bitmask_imm (val64, mode))
16365 {
16366 /* DUPM. */
16367 if (info)
16368 *info = simd_immediate_info (mode, val);
16369 return true;
16370 }
16371 return false;
16372 }
16373
16374 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16375
16376 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16377
16378 where PATTERN is the svpattern as a CONST_INT and where ZERO
16379 is a zero constant of the required PTRUE mode (which can have
16380 fewer elements than X's mode, if zero bits are significant).
16381
16382 If so, and if INFO is nonnull, describe the immediate in INFO. */
16383 bool
16384 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16385 {
16386 if (GET_CODE (x) != CONST)
16387 return false;
16388
16389 x = XEXP (x, 0);
16390 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16391 return false;
16392
16393 if (info)
16394 {
16395 aarch64_svpattern pattern
16396 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16397 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16398 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16399 *info = simd_immediate_info (int_mode, pattern);
16400 }
16401 return true;
16402 }
16403
16404 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16405 it to describe valid immediates. */
16406
16407 static bool
16408 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16409 {
16410 if (aarch64_sve_ptrue_svpattern_p (x, info))
16411 return true;
16412
16413 if (x == CONST0_RTX (GET_MODE (x)))
16414 {
16415 if (info)
16416 *info = simd_immediate_info (DImode, 0);
16417 return true;
16418 }
16419
16420 /* Analyze the value as a VNx16BImode. This should be relatively
16421 efficient, since rtx_vector_builder has enough built-in capacity
16422 to store all VLA predicate constants without needing the heap. */
16423 rtx_vector_builder builder;
16424 if (!aarch64_get_sve_pred_bits (builder, x))
16425 return false;
16426
16427 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16428 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16429 {
16430 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16431 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16432 if (pattern != AARCH64_NUM_SVPATTERNS)
16433 {
16434 if (info)
16435 {
16436 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16437 *info = simd_immediate_info (int_mode, pattern);
16438 }
16439 return true;
16440 }
16441 }
16442 return false;
16443 }
16444
16445 /* Return true if OP is a valid SIMD immediate for the operation
16446 described by WHICH. If INFO is nonnull, use it to describe valid
16447 immediates. */
16448 bool
16449 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16450 enum simd_immediate_check which)
16451 {
16452 machine_mode mode = GET_MODE (op);
16453 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16454 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16455 return false;
16456
16457 if (vec_flags & VEC_SVE_PRED)
16458 return aarch64_sve_pred_valid_immediate (op, info);
16459
16460 scalar_mode elt_mode = GET_MODE_INNER (mode);
16461 rtx base, step;
16462 unsigned int n_elts;
16463 if (GET_CODE (op) == CONST_VECTOR
16464 && CONST_VECTOR_DUPLICATE_P (op))
16465 n_elts = CONST_VECTOR_NPATTERNS (op);
16466 else if ((vec_flags & VEC_SVE_DATA)
16467 && const_vec_series_p (op, &base, &step))
16468 {
16469 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16470 if (!aarch64_sve_index_immediate_p (base)
16471 || !aarch64_sve_index_immediate_p (step))
16472 return false;
16473
16474 if (info)
16475 *info = simd_immediate_info (elt_mode, base, step);
16476 return true;
16477 }
16478 else if (GET_CODE (op) == CONST_VECTOR
16479 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16480 /* N_ELTS set above. */;
16481 else
16482 return false;
16483
16484 scalar_float_mode elt_float_mode;
16485 if (n_elts == 1
16486 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16487 {
16488 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16489 if (aarch64_float_const_zero_rtx_p (elt)
16490 || aarch64_float_const_representable_p (elt))
16491 {
16492 if (info)
16493 *info = simd_immediate_info (elt_float_mode, elt);
16494 return true;
16495 }
16496 }
16497
16498 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
16499 if (elt_size > 8)
16500 return false;
16501
16502 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
16503
16504 /* Expand the vector constant out into a byte vector, with the least
16505 significant byte of the register first. */
16506 auto_vec<unsigned char, 16> bytes;
16507 bytes.reserve (n_elts * elt_size);
16508 for (unsigned int i = 0; i < n_elts; i++)
16509 {
16510 /* The vector is provided in gcc endian-neutral fashion.
16511 For aarch64_be Advanced SIMD, it must be laid out in the vector
16512 register in reverse order. */
16513 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16514 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16515
16516 if (elt_mode != elt_int_mode)
16517 elt = gen_lowpart (elt_int_mode, elt);
16518
16519 if (!CONST_INT_P (elt))
16520 return false;
16521
16522 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
16523 for (unsigned int byte = 0; byte < elt_size; byte++)
16524 {
16525 bytes.quick_push (elt_val & 0xff);
16526 elt_val >>= BITS_PER_UNIT;
16527 }
16528 }
16529
16530 /* The immediate must repeat every eight bytes. */
16531 unsigned int nbytes = bytes.length ();
16532 for (unsigned i = 8; i < nbytes; ++i)
16533 if (bytes[i] != bytes[i - 8])
16534 return false;
16535
16536 /* Get the repeating 8-byte value as an integer. No endian correction
16537 is needed here because bytes is already in lsb-first order. */
16538 unsigned HOST_WIDE_INT val64 = 0;
16539 for (unsigned int i = 0; i < 8; i++)
16540 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
16541 << (i * BITS_PER_UNIT));
16542
16543 if (vec_flags & VEC_SVE_DATA)
16544 return aarch64_sve_valid_immediate (val64, info);
16545 else
16546 return aarch64_advsimd_valid_immediate (val64, info, which);
16547 }
16548
16549 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16550 has a step in the immediate range of an SVE INDEX instruction. Return the
16551 step if so, otherwise return null. */
16552 rtx
16553 aarch64_check_zero_based_sve_index_immediate (rtx x)
16554 {
16555 rtx base, step;
16556 if (const_vec_series_p (x, &base, &step)
16557 && base == const0_rtx
16558 && aarch64_sve_index_immediate_p (step))
16559 return step;
16560 return NULL_RTX;
16561 }
16562
16563 /* Check that immediate shift constants are within range. */
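/* For example, for V4SImode the element width is 32 bits, so left shifts
   accept 0..31 and right shifts accept 1..32.  */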
16564 bool
16565 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
16566 {
16567 x = unwrap_const_vec_duplicate (x);
16568 if (!CONST_INT_P (x))
16569 return false;
16570 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
16571 if (left)
16572 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
16573 else
16574 return IN_RANGE (INTVAL (x), 1, bit_width);
16575 }
16576
16577 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16578 operation of width WIDTH at bit position POS. */
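/* For example, WIDTH = 8 and POS = 16 gives the mask 0xff0000.  */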
16579
16580 rtx
16581 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
16582 {
16583 gcc_assert (CONST_INT_P (width));
16584 gcc_assert (CONST_INT_P (pos));
16585
16586 unsigned HOST_WIDE_INT mask
16587 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
16588 return GEN_INT (mask << UINTVAL (pos));
16589 }
16590
16591 bool
16592 aarch64_mov_operand_p (rtx x, machine_mode mode)
16593 {
16594 if (GET_CODE (x) == HIGH
16595 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
16596 return true;
16597
16598 if (CONST_INT_P (x))
16599 return true;
16600
16601 if (VECTOR_MODE_P (GET_MODE (x)))
16602 {
16603 /* Require predicate constants to be VNx16BI before RA, so that we
16604 force everything to have a canonical form. */
16605 if (!lra_in_progress
16606 && !reload_completed
16607 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
16608 && GET_MODE (x) != VNx16BImode)
16609 return false;
16610
16611 return aarch64_simd_valid_immediate (x, NULL);
16612 }
16613
16614 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
16615 return true;
16616
16617 if (aarch64_sve_cnt_immediate_p (x))
16618 return true;
16619
16620 return aarch64_classify_symbolic_expression (x)
16621 == SYMBOL_TINY_ABSOLUTE;
16622 }
16623
16624 /* Return a const_int vector of VAL. */
16625 rtx
16626 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
16627 {
16628 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
16629 return gen_const_vec_duplicate (mode, c);
16630 }
16631
16632 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16633
16634 bool
16635 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
16636 {
16637 machine_mode vmode;
16638
16639 vmode = aarch64_simd_container_mode (mode, 64);
16640 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
16641 return aarch64_simd_valid_immediate (op_v, NULL);
16642 }
16643
16644 /* Construct and return a PARALLEL RTX vector with elements numbering the
16645 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16646 the vector - from the perspective of the architecture. This does not
16647 line up with GCC's perspective on lane numbers, so we end up with
16648 different masks depending on our target endian-ness. The diagram
16649 below may help. We must draw the distinction when building masks
16650 which select one half of the vector. An instruction selecting
16651 architectural low-lanes for a big-endian target, must be described using
16652 a mask selecting GCC high-lanes.
16653
16654 Big-Endian Little-Endian
16655
16656 GCC 0 1 2 3 3 2 1 0
16657 | x | x | x | x | | x | x | x | x |
16658 Architecture 3 2 1 0 3 2 1 0
16659
16660 Low Mask: { 2, 3 } { 0, 1 }
16661 High Mask: { 0, 1 } { 2, 3 }
16662
16663 MODE Is the mode of the vector and NUNITS is the number of units in it. */
16664
16665 rtx
16666 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
16667 {
16668 rtvec v = rtvec_alloc (nunits / 2);
16669 int high_base = nunits / 2;
16670 int low_base = 0;
16671 int base;
16672 rtx t1;
16673 int i;
16674
16675 if (BYTES_BIG_ENDIAN)
16676 base = high ? low_base : high_base;
16677 else
16678 base = high ? high_base : low_base;
16679
16680 for (i = 0; i < nunits / 2; i++)
16681 RTVEC_ELT (v, i) = GEN_INT (base + i);
16682
16683 t1 = gen_rtx_PARALLEL (mode, v);
16684 return t1;
16685 }
16686
16687 /* Check OP for validity as a PARALLEL RTX vector with elements
16688 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
16689 from the perspective of the architecture. See the diagram above
16690 aarch64_simd_vect_par_cnst_half for more details. */
16691
16692 bool
16693 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
16694 bool high)
16695 {
16696 int nelts;
16697 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
16698 return false;
16699
16700 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
16701 HOST_WIDE_INT count_op = XVECLEN (op, 0);
16702 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
16703 int i = 0;
16704
16705 if (count_op != count_ideal)
16706 return false;
16707
16708 for (i = 0; i < count_ideal; i++)
16709 {
16710 rtx elt_op = XVECEXP (op, 0, i);
16711 rtx elt_ideal = XVECEXP (ideal, 0, i);
16712
16713 if (!CONST_INT_P (elt_op)
16714 || INTVAL (elt_ideal) != INTVAL (elt_op))
16715 return false;
16716 }
16717 return true;
16718 }
16719
16720 /* Return a PARALLEL containing NELTS elements, with element I equal
16721 to BASE + I * STEP. */
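/* E.g. NELTS = 4, BASE = 1, STEP = 2 yields a PARALLEL of the constants
   1, 3, 5 and 7.  */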
16722
16723 rtx
16724 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
16725 {
16726 rtvec vec = rtvec_alloc (nelts);
16727 for (unsigned int i = 0; i < nelts; ++i)
16728 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
16729 return gen_rtx_PARALLEL (VOIDmode, vec);
16730 }
16731
16732 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
16733 series with step STEP. */
16734
16735 bool
16736 aarch64_stepped_int_parallel_p (rtx op, int step)
16737 {
16738 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
16739 return false;
16740
16741 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
16742 for (int i = 1; i < XVECLEN (op, 0); ++i)
16743 if (!CONST_INT_P (XVECEXP (op, 0, i))
16744 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
16745 return false;
16746
16747 return true;
16748 }
16749
16750 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
16751 HIGH (exclusive). */
16752 void
16753 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
16754 const_tree exp)
16755 {
16756 HOST_WIDE_INT lane;
16757 gcc_assert (CONST_INT_P (operand));
16758 lane = INTVAL (operand);
16759
16760 if (lane < low || lane >= high)
16761 {
16762 if (exp)
16763 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
16764 else
16765 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
16766 }
16767 }
16768
16769 /* Perform endian correction on lane number N, which indexes a vector
16770 of mode MODE, and return the result as an SImode rtx. */
16771
16772 rtx
16773 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
16774 {
16775 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
16776 }
16777
16778 /* Return TRUE if OP is a valid vector addressing mode. */
16779
16780 bool
16781 aarch64_simd_mem_operand_p (rtx op)
16782 {
16783 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
16784 || REG_P (XEXP (op, 0)));
16785 }
16786
16787 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
16788
16789 bool
16790 aarch64_sve_ld1r_operand_p (rtx op)
16791 {
16792 struct aarch64_address_info addr;
16793 scalar_mode mode;
16794
16795 return (MEM_P (op)
16796 && is_a <scalar_mode> (GET_MODE (op), &mode)
16797 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
16798 && addr.type == ADDRESS_REG_IMM
16799 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
16800 }
16801
16802 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
16803 bool
16804 aarch64_sve_ld1rq_operand_p (rtx op)
16805 {
16806 struct aarch64_address_info addr;
16807 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
16808 if (!MEM_P (op)
16809 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
16810 return false;
16811
16812 if (addr.type == ADDRESS_REG_IMM)
16813 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
16814
16815 if (addr.type == ADDRESS_REG_REG)
16816 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
16817
16818 return false;
16819 }
16820
16821 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
16822 bool
16823 aarch64_sve_ldff1_operand_p (rtx op)
16824 {
16825 if (!MEM_P (op))
16826 return false;
16827
16828 struct aarch64_address_info addr;
16829 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
16830 return false;
16831
16832 if (addr.type == ADDRESS_REG_IMM)
16833 return known_eq (addr.const_offset, 0);
16834
16835 return addr.type == ADDRESS_REG_REG;
16836 }
16837
16838 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
16839 bool
16840 aarch64_sve_ldnf1_operand_p (rtx op)
16841 {
16842 struct aarch64_address_info addr;
16843
16844 return (MEM_P (op)
16845 && aarch64_classify_address (&addr, XEXP (op, 0),
16846 GET_MODE (op), false)
16847 && addr.type == ADDRESS_REG_IMM);
16848 }
16849
16850 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
16851 The conditions for STR are the same. */
16852 bool
16853 aarch64_sve_ldr_operand_p (rtx op)
16854 {
16855 struct aarch64_address_info addr;
16856
16857 return (MEM_P (op)
16858 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
16859 false, ADDR_QUERY_ANY)
16860 && addr.type == ADDRESS_REG_IMM);
16861 }
16862
16863 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
16864 addressing memory of mode MODE. */
16865 bool
16866 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
16867 {
16868 struct aarch64_address_info addr;
16869 if (!aarch64_classify_address (&addr, op, mode, false))
16870 return false;
16871
16872 if (addr.type == ADDRESS_REG_IMM)
16873 return known_eq (addr.const_offset, 0);
16874
16875 return addr.type == ADDRESS_REG_REG;
16876 }
16877
16878 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
16879 We need to be able to access the individual pieces, so the range
16880 is different from LD[234] and ST[234]. */
16881 bool
16882 aarch64_sve_struct_memory_operand_p (rtx op)
16883 {
16884 if (!MEM_P (op))
16885 return false;
16886
16887 machine_mode mode = GET_MODE (op);
16888 struct aarch64_address_info addr;
16889 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
16890 ADDR_QUERY_ANY)
16891 || addr.type != ADDRESS_REG_IMM)
16892 return false;
16893
16894 poly_int64 first = addr.const_offset;
16895 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
16896 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
16897 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
16898 }
16899
16900 /* Emit a register copy from operand to operand, taking care not to
16901 early-clobber source registers in the process.
16902
16903 COUNT is the number of components into which the copy needs to be
16904 decomposed. */
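/* For example, when moving a two-register value from, say, V1-V2 to V2-V3,
   the loop below copies V3 from V2 first and then V2 from V1, so the
   overlapping source register is read before it is overwritten.  */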
16905 void
16906 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
16907 unsigned int count)
16908 {
16909 unsigned int i;
16910 int rdest = REGNO (operands[0]);
16911 int rsrc = REGNO (operands[1]);
16912
16913 if (!reg_overlap_mentioned_p (operands[0], operands[1])
16914 || rdest < rsrc)
16915 for (i = 0; i < count; i++)
16916 emit_move_insn (gen_rtx_REG (mode, rdest + i),
16917 gen_rtx_REG (mode, rsrc + i));
16918 else
16919 for (i = 0; i < count; i++)
16920 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
16921 gen_rtx_REG (mode, rsrc + count - i - 1));
16922 }
16923
16924 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16925 one of VSTRUCT modes: OI, CI, or XI. */
16926 int
16927 aarch64_simd_attr_length_rglist (machine_mode mode)
16928 {
16929 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16930 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16931 }
16932
16933 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16934 alignment of a vector to 128 bits. SVE predicates have an alignment of
16935 16 bits. */
16936 static HOST_WIDE_INT
16937 aarch64_simd_vector_alignment (const_tree type)
16938 {
16939 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16940 be set for non-predicate vectors of booleans. Modes are the most
16941 direct way we have of identifying real SVE predicate types. */
16942 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
16943 return 16;
16944 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16945 return 128;
16946 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
16947 }
16948
16949 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16950 static poly_uint64
16951 aarch64_vectorize_preferred_vector_alignment (const_tree type)
16952 {
16953 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
16954 {
16955 /* If the length of the vector is fixed, try to align to that length,
16956 otherwise don't try to align at all. */
16957 HOST_WIDE_INT result;
16958 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
16959 result = TYPE_ALIGN (TREE_TYPE (type));
16960 return result;
16961 }
16962 return TYPE_ALIGN (type);
16963 }
16964
16965 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16966 static bool
16967 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
16968 {
16969 if (is_packed)
16970 return false;
16971
16972 /* For fixed-length vectors, check that the vectorizer will aim for
16973 full-vector alignment. This isn't true for generic GCC vectors
16974 that are wider than the ABI maximum of 128 bits. */
16975 poly_uint64 preferred_alignment =
16976 aarch64_vectorize_preferred_vector_alignment (type);
16977 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16978 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16979 preferred_alignment))
16980 return false;
16981
16982 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16983 return true;
16984 }
16985
16986 /* Return true if the vector misalignment factor is supported by the
16987 target. */
16988 static bool
16989 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16990 const_tree type, int misalignment,
16991 bool is_packed)
16992 {
16993 if (TARGET_SIMD && STRICT_ALIGNMENT)
16994 {
16995 /* Return false if the movmisalign pattern is not supported for this mode. */
16996 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16997 return false;
16998
16999 /* Misalignment factor is unknown at compile time. */
17000 if (misalignment == -1)
17001 return false;
17002 }
17003 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17004 is_packed);
17005 }
17006
17007 /* If VALS is a vector constant that can be loaded into a register
17008 using DUP, generate instructions to do so and return an RTX to
17009 assign to the register. Otherwise return NULL_RTX. */
17010 static rtx
17011 aarch64_simd_dup_constant (rtx vals)
17012 {
17013 machine_mode mode = GET_MODE (vals);
17014 machine_mode inner_mode = GET_MODE_INNER (mode);
17015 rtx x;
17016
17017 if (!const_vec_duplicate_p (vals, &x))
17018 return NULL_RTX;
17019
17020 /* We can load this constant by using DUP and a constant in a
17021 single general-purpose register. This will be cheaper than a vector
17022 load. */
17023 x = copy_to_mode_reg (inner_mode, x);
17024 return gen_vec_duplicate (mode, x);
17025 }
17026
17027
17028 /* Generate code to load VALS, which is a PARALLEL containing only
17029 constants (for vec_init) or CONST_VECTOR, efficiently into a
17030 register. Returns an RTX to copy into the register, or NULL_RTX
17031 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17032 static rtx
17033 aarch64_simd_make_constant (rtx vals)
17034 {
17035 machine_mode mode = GET_MODE (vals);
17036 rtx const_dup;
17037 rtx const_vec = NULL_RTX;
17038 int n_const = 0;
17039 int i;
17040
17041 if (GET_CODE (vals) == CONST_VECTOR)
17042 const_vec = vals;
17043 else if (GET_CODE (vals) == PARALLEL)
17044 {
17045 /* A CONST_VECTOR must contain only CONST_INTs and
17046 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17047 Only store valid constants in a CONST_VECTOR. */
17048 int n_elts = XVECLEN (vals, 0);
17049 for (i = 0; i < n_elts; ++i)
17050 {
17051 rtx x = XVECEXP (vals, 0, i);
17052 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17053 n_const++;
17054 }
17055 if (n_const == n_elts)
17056 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17057 }
17058 else
17059 gcc_unreachable ();
17060
17061 if (const_vec != NULL_RTX
17062 && aarch64_simd_valid_immediate (const_vec, NULL))
17063 /* Load using MOVI/MVNI. */
17064 return const_vec;
17065 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17066 /* Loaded using DUP. */
17067 return const_dup;
17068 else if (const_vec != NULL_RTX)
17069 /* Load from constant pool. We cannot take advantage of single-cycle
17070 LD1 because we need a PC-relative addressing mode. */
17071 return const_vec;
17072 else
17073 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17074 We cannot construct an initializer. */
17075 return NULL_RTX;
17076 }
17077
17078 /* Expand a vector initialisation sequence, such that TARGET is
17079 initialised to contain VALS. */
17080
17081 void
17082 aarch64_expand_vector_init (rtx target, rtx vals)
17083 {
17084 machine_mode mode = GET_MODE (target);
17085 scalar_mode inner_mode = GET_MODE_INNER (mode);
17086 /* The number of vector elements. */
17087 int n_elts = XVECLEN (vals, 0);
17088 /* The number of vector elements which are not constant. */
17089 int n_var = 0;
17090 rtx any_const = NULL_RTX;
17091 /* The first element of vals. */
17092 rtx v0 = XVECEXP (vals, 0, 0);
17093 bool all_same = true;
17094
17095 /* This is a special vec_init<M><N> where N is not an element mode but a
17096 vector mode with half the elements of M. We expect to find two entries
17097 of mode N in VALS and we must put their concatenation into TARGET. */
17098 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17099 {
17100 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17101 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17102 rtx lo = XVECEXP (vals, 0, 0);
17103 rtx hi = XVECEXP (vals, 0, 1);
17104 machine_mode narrow_mode = GET_MODE (lo);
17105 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17106 gcc_assert (narrow_mode == GET_MODE (hi));
17107
17108 /* When we want to concatenate a half-width vector with zeroes we can
17109 use the aarch64_combinez[_be] patterns. Just make sure that the
17110 zeroes are in the right half. */
17111 if (BYTES_BIG_ENDIAN
17112 && aarch64_simd_imm_zero (lo, narrow_mode)
17113 && general_operand (hi, narrow_mode))
17114 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17115 else if (!BYTES_BIG_ENDIAN
17116 && aarch64_simd_imm_zero (hi, narrow_mode)
17117 && general_operand (lo, narrow_mode))
17118 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17119 else
17120 {
17121 /* Else create the two half-width registers and combine them. */
17122 if (!REG_P (lo))
17123 lo = force_reg (GET_MODE (lo), lo);
17124 if (!REG_P (hi))
17125 hi = force_reg (GET_MODE (hi), hi);
17126
17127 if (BYTES_BIG_ENDIAN)
17128 std::swap (lo, hi);
17129 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17130 }
17131 return;
17132 }
17133
17134 /* Count the number of variable elements to initialise. */
17135 for (int i = 0; i < n_elts; ++i)
17136 {
17137 rtx x = XVECEXP (vals, 0, i);
17138 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17139 ++n_var;
17140 else
17141 any_const = x;
17142
17143 all_same &= rtx_equal_p (x, v0);
17144 }
17145
17146 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17147 how best to handle this. */
17148 if (n_var == 0)
17149 {
17150 rtx constant = aarch64_simd_make_constant (vals);
17151 if (constant != NULL_RTX)
17152 {
17153 emit_move_insn (target, constant);
17154 return;
17155 }
17156 }
17157
17158 /* Splat a single non-constant element if we can. */
17159 if (all_same)
17160 {
17161 rtx x = copy_to_mode_reg (inner_mode, v0);
17162 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17163 return;
17164 }
17165
17166 enum insn_code icode = optab_handler (vec_set_optab, mode);
17167 gcc_assert (icode != CODE_FOR_nothing);
17168
17169 /* If there are only variable elements, try to optimize
17170 the insertion using dup for the most common element
17171 followed by insertions. */
17172
17173 /* The algorithm will fill matches[*][0] with the earliest matching element,
17174 and matches[X][1] with the count of duplicate elements (if X is the
17175 earliest element which has duplicates). */
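/* For example, for VALS = {a, b, a, a} this gives matches[2][0] ==
   matches[3][0] == 0 and matches[0][1] == 3, so we dup A into the vector
   register and then insert B into lane 1.  */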
17176
17177 if (n_var == n_elts && n_elts <= 16)
17178 {
17179 int matches[16][2] = {0};
17180 for (int i = 0; i < n_elts; i++)
17181 {
17182 for (int j = 0; j <= i; j++)
17183 {
17184 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17185 {
17186 matches[i][0] = j;
17187 matches[j][1]++;
17188 break;
17189 }
17190 }
17191 }
17192 int maxelement = 0;
17193 int maxv = 0;
17194 for (int i = 0; i < n_elts; i++)
17195 if (matches[i][1] > maxv)
17196 {
17197 maxelement = i;
17198 maxv = matches[i][1];
17199 }
17200
17201 /* Create a duplicate of the most common element, unless all elements
17202 are equally useless to us, in which case just immediately set the
17203 vector register using the first element. */
17204
17205 if (maxv == 1)
17206 {
17207 /* For vectors of two 64-bit elements, we can do even better. */
17208 if (n_elts == 2
17209 && (inner_mode == E_DImode
17210 || inner_mode == E_DFmode))
17211
17212 {
17213 rtx x0 = XVECEXP (vals, 0, 0);
17214 rtx x1 = XVECEXP (vals, 0, 1);
17215 /* Combine can pick up this case, but handling it directly
17216 here leaves clearer RTL.
17217
17218 This is load_pair_lanes<mode>, and also gives us a clean-up
17219 for store_pair_lanes<mode>. */
17220 if (memory_operand (x0, inner_mode)
17221 && memory_operand (x1, inner_mode)
17222 && !STRICT_ALIGNMENT
17223 && rtx_equal_p (XEXP (x1, 0),
17224 plus_constant (Pmode,
17225 XEXP (x0, 0),
17226 GET_MODE_SIZE (inner_mode))))
17227 {
17228 rtx t;
17229 if (inner_mode == DFmode)
17230 t = gen_load_pair_lanesdf (target, x0, x1);
17231 else
17232 t = gen_load_pair_lanesdi (target, x0, x1);
17233 emit_insn (t);
17234 return;
17235 }
17236 }
17237 /* The subreg-move sequence below will move into lane zero of the
17238 vector register. For big-endian we want that position to hold
17239 the last element of VALS. */
17240 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17241 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17242 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17243 }
17244 else
17245 {
17246 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17247 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17248 }
17249
17250 /* Insert the rest. */
17251 for (int i = 0; i < n_elts; i++)
17252 {
17253 rtx x = XVECEXP (vals, 0, i);
17254 if (matches[i][0] == maxelement)
17255 continue;
17256 x = copy_to_mode_reg (inner_mode, x);
17257 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17258 }
17259 return;
17260 }
17261
17262 /* Initialise a vector which is part-variable. We want to first try
17263 to build those lanes which are constant in the most efficient way we
17264 can. */
17265 if (n_var != n_elts)
17266 {
17267 rtx copy = copy_rtx (vals);
17268
17269 /* Load constant part of vector. We really don't care what goes into the
17270 parts we will overwrite, but we're more likely to be able to load the
17271 constant efficiently if it has fewer, larger, repeating parts
17272 (see aarch64_simd_valid_immediate). */
17273 for (int i = 0; i < n_elts; i++)
17274 {
17275 rtx x = XVECEXP (vals, 0, i);
17276 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17277 continue;
17278 rtx subst = any_const;
17279 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17280 {
17281 /* Look in the copied vector, as more elements are const. */
17282 rtx test = XVECEXP (copy, 0, i ^ bit);
17283 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17284 {
17285 subst = test;
17286 break;
17287 }
17288 }
17289 XVECEXP (copy, 0, i) = subst;
17290 }
17291 aarch64_expand_vector_init (target, copy);
17292 }
17293
17294 /* Insert the variable lanes directly. */
17295 for (int i = 0; i < n_elts; i++)
17296 {
17297 rtx x = XVECEXP (vals, 0, i);
17298 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17299 continue;
17300 x = copy_to_mode_reg (inner_mode, x);
17301 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17302 }
17303 }
17304
17305 /* Emit RTL corresponding to:
17306 insr TARGET, ELEM. */
17307
17308 static void
17309 emit_insr (rtx target, rtx elem)
17310 {
17311 machine_mode mode = GET_MODE (target);
17312 scalar_mode elem_mode = GET_MODE_INNER (mode);
17313 elem = force_reg (elem_mode, elem);
17314
17315 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17316 gcc_assert (icode != CODE_FOR_nothing);
17317 emit_insn (GEN_FCN (icode) (target, target, elem));
17318 }
17319
17320 /* Subroutine of aarch64_sve_expand_vector_init for handling
17321 trailing constants.
17322 This function works as follows:
17323 (a) Create a new vector consisting of trailing constants.
17324 (b) Initialize TARGET with the constant vector using emit_move_insn.
17325 (c) Insert remaining elements in TARGET using insr.
17326 NELTS is the total number of elements in the original vector, while
17327 NELTS_REQD is the number of elements that are actually
17328 significant.
17329
17330 ??? The heuristic used is to do the above only if the number of constants
17331 is at least half the total number of elements. May need fine-tuning. */
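/* As an illustration, for BUILDER = {a, b, 1, 2} with NELTS_REQD == 4 the
   two trailing constants meet the threshold, so TARGET is first loaded from
   a constant vector that starts with {1, 2, ...} and the variable elements
   are then inserted with "insr TARGET, b" followed by "insr TARGET, a",
   leaving {a, b, 1, 2, ...}.  */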
17332
17333 static bool
17334 aarch64_sve_expand_vector_init_handle_trailing_constants
17335 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17336 {
17337 machine_mode mode = GET_MODE (target);
17338 scalar_mode elem_mode = GET_MODE_INNER (mode);
17339 int n_trailing_constants = 0;
17340
17341 for (int i = nelts_reqd - 1;
17342 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17343 i--)
17344 n_trailing_constants++;
17345
17346 if (n_trailing_constants >= nelts_reqd / 2)
17347 {
17348 rtx_vector_builder v (mode, 1, nelts);
17349 for (int i = 0; i < nelts; i++)
17350 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17351 rtx const_vec = v.build ();
17352 emit_move_insn (target, const_vec);
17353
17354 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17355 emit_insr (target, builder.elt (i));
17356
17357 return true;
17358 }
17359
17360 return false;
17361 }
17362
17363 /* Subroutine of aarch64_sve_expand_vector_init.
17364 Works as follows:
17365 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17366 (b) Skip trailing elements from BUILDER, which are the same as
17367 element NELTS_REQD - 1.
17368 (c) Insert earlier elements in reverse order in TARGET using insr. */
17369
17370 static void
17371 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17372 const rtx_vector_builder &builder,
17373 int nelts_reqd)
17374 {
17375 machine_mode mode = GET_MODE (target);
17376 scalar_mode elem_mode = GET_MODE_INNER (mode);
17377
17378 struct expand_operand ops[2];
17379 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17380 gcc_assert (icode != CODE_FOR_nothing);
17381
17382 create_output_operand (&ops[0], target, mode);
17383 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17384 expand_insn (icode, 2, ops);
17385
17386 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17387 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17388 emit_insr (target, builder.elt (i));
17389 }
17390
17391 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17392 when all trailing elements of builder are same.
17393 This works as follows:
17394 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17395 (b) Insert remaining elements in TARGET using insr.
17396
17397 ??? The heuristic used is to do the above if the number of identical trailing
17398 elements is at least 3/4 of the total number of elements, loosely based on
17399 the heuristic from mostly_zeros_p. May need fine-tuning. */
17400
17401 static bool
17402 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17403 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17404 {
17405 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17406 if (ndups >= (3 * nelts_reqd) / 4)
17407 {
17408 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17409 nelts_reqd - ndups + 1);
17410 return true;
17411 }
17412
17413 return false;
17414 }
17415
17416 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17417 of elements in BUILDER.
17418
17419 The function tries to initialize TARGET from BUILDER if it fits one
17420 of the special cases outlined below.
17421
17422 Failing that, the function divides BUILDER into two sub-vectors:
17423 v_even = even elements of BUILDER;
17424 v_odd = odd elements of BUILDER;
17425
17426 and recursively calls itself with v_even and v_odd.
17427
17428 if (recursive call succeeded for v_even or v_odd)
17429 TARGET = zip (v_even, v_odd)
17430
17431 The function returns true if it managed to build TARGET from BUILDER
17432 with one of the special cases, false otherwise.
17433
17434 Example: {a, 1, b, 2, c, 3, d, 4}
17435
17436 The vector gets divided into:
17437 v_even = {a, b, c, d}
17438 v_odd = {1, 2, 3, 4}
17439
17440 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17441 initialize tmp2 from constant vector v_odd using emit_move_insn.
17442
17443 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17444 4 elements, so we construct tmp1 from v_even using insr:
17445 tmp1 = dup(d)
17446 insr tmp1, c
17447 insr tmp1, b
17448 insr tmp1, a
17449
17450 And finally:
17451 TARGET = zip (tmp1, tmp2)
17452 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17453
17454 static bool
17455 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17456 int nelts, int nelts_reqd)
17457 {
17458 machine_mode mode = GET_MODE (target);
17459
17460 /* Case 1: Vector contains trailing constants. */
17461
17462 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17463 (target, builder, nelts, nelts_reqd))
17464 return true;
17465
17466 /* Case 2: Vector contains leading constants. */
17467
17468 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17469 for (int i = 0; i < nelts_reqd; i++)
17470 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17471 rev_builder.finalize ();
17472
17473 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17474 (target, rev_builder, nelts, nelts_reqd))
17475 {
17476 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17477 return true;
17478 }
17479
17480 /* Case 3: Vector contains trailing same element. */
17481
17482 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17483 (target, builder, nelts_reqd))
17484 return true;
17485
17486 /* Case 4: Vector contains leading same element. */
17487
17488 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17489 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17490 {
17491 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17492 return true;
17493 }
17494
17495 /* Avoid recursing below 4-elements.
17496 ??? The threshold 4 may need fine-tuning. */
17497
17498 if (nelts_reqd <= 4)
17499 return false;
17500
17501 rtx_vector_builder v_even (mode, 1, nelts);
17502 rtx_vector_builder v_odd (mode, 1, nelts);
17503
17504 for (int i = 0; i < nelts * 2; i += 2)
17505 {
17506 v_even.quick_push (builder.elt (i));
17507 v_odd.quick_push (builder.elt (i + 1));
17508 }
17509
17510 v_even.finalize ();
17511 v_odd.finalize ();
17512
17513 rtx tmp1 = gen_reg_rtx (mode);
17514 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
17515 nelts, nelts_reqd / 2);
17516
17517 rtx tmp2 = gen_reg_rtx (mode);
17518 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
17519 nelts, nelts_reqd / 2);
17520
17521 if (!did_even_p && !did_odd_p)
17522 return false;
17523
17524 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17525 special cases and zip v_even, v_odd. */
17526
17527 if (!did_even_p)
17528 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
17529
17530 if (!did_odd_p)
17531 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
17532
17533 rtvec v = gen_rtvec (2, tmp1, tmp2);
17534 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
17535 return true;
17536 }
17537
17538 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17539
17540 void
17541 aarch64_sve_expand_vector_init (rtx target, rtx vals)
17542 {
17543 machine_mode mode = GET_MODE (target);
17544 int nelts = XVECLEN (vals, 0);
17545
17546 rtx_vector_builder v (mode, 1, nelts);
17547 for (int i = 0; i < nelts; i++)
17548 v.quick_push (XVECEXP (vals, 0, i));
17549 v.finalize ();
17550
17551 /* If neither of the sub-vectors of v could be initialized specially,
17552 then use INSR to insert all elements from v into TARGET.
17553 ??? This might not be optimal for vectors with large
17554 initializers, e.g. 16 elements or more.
17555 For nelts < 4, it probably isn't useful to handle specially. */
17556
17557 if (nelts < 4
17558 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
17559 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
17560 }
17561
17562 /* Check whether VALUE is a vector constant in which every element
17563 is either a power of 2 or a negated power of 2. If so, return
17564 a constant vector of log2s, and flip CODE between PLUS and MINUS
17565 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
17566
17567 static rtx
17568 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
17569 {
17570 if (GET_CODE (value) != CONST_VECTOR)
17571 return NULL_RTX;
17572
17573 rtx_vector_builder builder;
17574 if (!builder.new_unary_operation (GET_MODE (value), value, false))
17575 return NULL_RTX;
17576
17577 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
17578 /* 1 if the result of the multiplication must be negated,
17579 0 if it mustn't, or -1 if we don't yet care. */
17580 int negate = -1;
17581 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
17582 for (unsigned int i = 0; i < encoded_nelts; ++i)
17583 {
17584 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
17585 if (!CONST_SCALAR_INT_P (elt))
17586 return NULL_RTX;
17587 rtx_mode_t val (elt, int_mode);
17588 wide_int pow2 = wi::neg (val);
17589 if (val != pow2)
17590 {
17591 /* It matters whether we negate or not. Make that choice,
17592 and make sure that it's consistent with previous elements. */
17593 if (negate == !wi::neg_p (val))
17594 return NULL_RTX;
17595 negate = wi::neg_p (val);
17596 if (!negate)
17597 pow2 = val;
17598 }
17599 /* POW2 is now the value that we want to be a power of 2. */
17600 int shift = wi::exact_log2 (pow2);
17601 if (shift < 0)
17602 return NULL_RTX;
17603 builder.quick_push (gen_int_mode (shift, int_mode));
17604 }
17605 if (negate == -1)
17606 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17607 code = PLUS;
17608 else if (negate == 1)
17609 code = code == PLUS ? MINUS : PLUS;
17610 return builder.build ();
17611 }
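
/* For example (values chosen purely for illustration): the CONST_VECTOR
   {4, 4, 4, 4} is converted to the shift vector {2, 2, 2, 2} with CODE left
   unchanged, whereas {-8, -8, -8, -8} is converted to {3, 3, 3, 3} and CODE
   is flipped between PLUS and MINUS.  Mixed-sign or non-power-of-2 vectors
   such as {4, -4, 4, 4} or {4, 3, 4, 4} are rejected and NULL_RTX is
   returned.  */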
17612
17613 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17614 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17615 operands array, in the same order as for fma_optab. Return true if
17616 the function emitted all the necessary instructions, false if the caller
17617 should generate the pattern normally with the new OPERANDS array. */
17618
17619 bool
17620 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
17621 {
17622 machine_mode mode = GET_MODE (operands[0]);
17623 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
17624 {
17625 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
17626 NULL_RTX, true, OPTAB_DIRECT);
17627 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
17628 operands[3], product, operands[0], true,
17629 OPTAB_DIRECT);
17630 return true;
17631 }
17632 operands[2] = force_reg (mode, operands[2]);
17633 return false;
17634 }
17635
17636 /* Likewise, but for a conditional pattern. */
17637
17638 bool
17639 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
17640 {
17641 machine_mode mode = GET_MODE (operands[0]);
17642 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
17643 {
17644 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
17645 NULL_RTX, true, OPTAB_DIRECT);
17646 emit_insn (gen_cond (code, mode, operands[0], operands[1],
17647 operands[4], product, operands[5]));
17648 return true;
17649 }
17650 operands[3] = force_reg (mode, operands[3]);
17651 return false;
17652 }
17653
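/* Implement TARGET_SHIFT_TRUNCATION_MASK.  Return the mask of bits that are
   significant in a shift count, or 0 if shift counts should not be assumed
   to be truncated (which is always the case for vector modes).  */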
17654 static unsigned HOST_WIDE_INT
17655 aarch64_shift_truncation_mask (machine_mode mode)
17656 {
17657 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
17658 return 0;
17659 return GET_MODE_UNIT_BITSIZE (mode) - 1;
17660 }
17661
17662 /* Select a format to encode pointers in exception handling data. */
17663 int
17664 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
17665 {
17666 int type;
17667 switch (aarch64_cmodel)
17668 {
17669 case AARCH64_CMODEL_TINY:
17670 case AARCH64_CMODEL_TINY_PIC:
17671 case AARCH64_CMODEL_SMALL:
17672 case AARCH64_CMODEL_SMALL_PIC:
17673 case AARCH64_CMODEL_SMALL_SPIC:
17674 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
17675 for everything. */
17676 type = DW_EH_PE_sdata4;
17677 break;
17678 default:
17679 /* No assumptions here. 8-byte relocs required. */
17680 type = DW_EH_PE_sdata8;
17681 break;
17682 }
17683 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
17684 }
17685
17686 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
17687
17688 static void
17689 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
17690 {
17691 if (TREE_CODE (decl) == FUNCTION_DECL)
17692 {
17693 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
17694 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
17695 {
17696 fprintf (stream, "\t.variant_pcs\t");
17697 assemble_name (stream, name);
17698 fprintf (stream, "\n");
17699 }
17700 }
17701 }
17702
17703 /* The last .arch and .tune assembly strings that we printed. */
17704 static std::string aarch64_last_printed_arch_string;
17705 static std::string aarch64_last_printed_tune_string;
17706
17707 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
17708 by the function fndecl. */
17709
17710 void
17711 aarch64_declare_function_name (FILE *stream, const char* name,
17712 tree fndecl)
17713 {
17714 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
17715
17716 struct cl_target_option *targ_options;
17717 if (target_parts)
17718 targ_options = TREE_TARGET_OPTION (target_parts);
17719 else
17720 targ_options = TREE_TARGET_OPTION (target_option_current_node);
17721 gcc_assert (targ_options);
17722
17723 const struct processor *this_arch
17724 = aarch64_get_arch (targ_options->x_explicit_arch);
17725
17726 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
17727 std::string extension
17728 = aarch64_get_extension_string_for_isa_flags (isa_flags,
17729 this_arch->flags);
17730 /* Only update the assembler .arch string if it is distinct from the last
17731 such string we printed. */
17732 std::string to_print = this_arch->name + extension;
17733 if (to_print != aarch64_last_printed_arch_string)
17734 {
17735 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
17736 aarch64_last_printed_arch_string = to_print;
17737 }
17738
17739 /* Print the cpu name we're tuning for in the comments; it might be
17740 useful to readers of the generated asm. Do it only when it changes
17741 from function to function and verbose assembly is requested. */
17742 const struct processor *this_tune
17743 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
17744
17745 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
17746 {
17747 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
17748 this_tune->name);
17749 aarch64_last_printed_tune_string = this_tune->name;
17750 }
17751
17752 aarch64_asm_output_variant_pcs (stream, fndecl, name);
17753
17754 /* Don't forget the type directive for ELF. */
17755 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
17756 ASM_OUTPUT_LABEL (stream, name);
17757 }
17758
17759 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
17760
17761 void
17762 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
17763 {
17764 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
17765 const char *value = IDENTIFIER_POINTER (target);
17766 aarch64_asm_output_variant_pcs (stream, decl, name);
17767 ASM_OUTPUT_DEF (stream, name, value);
17768 }
17769
17770 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
17771 function symbol references. */
17772
17773 void
17774 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
17775 {
17776 default_elf_asm_output_external (stream, decl, name);
17777 aarch64_asm_output_variant_pcs (stream, decl, name);
17778 }
17779
17780 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
17781 Used to output the .cfi_b_key_frame directive when signing the current
17782 function with the B key. */
17783
17784 void
17785 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
17786 {
17787 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
17788 && aarch64_ra_sign_key == AARCH64_KEY_B)
17789 asm_fprintf (f, "\t.cfi_b_key_frame\n");
17790 }
17791
17792 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
17793
17794 static void
17795 aarch64_start_file (void)
17796 {
17797 struct cl_target_option *default_options
17798 = TREE_TARGET_OPTION (target_option_default_node);
17799
17800 const struct processor *default_arch
17801 = aarch64_get_arch (default_options->x_explicit_arch);
17802 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
17803 std::string extension
17804 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
17805 default_arch->flags);
17806
17807 aarch64_last_printed_arch_string = default_arch->name + extension;
17808 aarch64_last_printed_tune_string = "";
17809 asm_fprintf (asm_out_file, "\t.arch %s\n",
17810 aarch64_last_printed_arch_string.c_str ());
17811
17812 default_file_start ();
17813 }
17814
17815 /* Emit load exclusive. */
17816
17817 static void
17818 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
17819 rtx mem, rtx model_rtx)
17820 {
17821 if (mode == TImode)
17822 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
17823 gen_highpart (DImode, rval),
17824 mem, model_rtx));
17825 else
17826 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
17827 }
17828
17829 /* Emit store exclusive. */
17830
17831 static void
17832 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
17833 rtx mem, rtx rval, rtx model_rtx)
17834 {
17835 if (mode == TImode)
17836 emit_insn (gen_aarch64_store_exclusive_pair
17837 (bval, mem, operand_subword (rval, 0, 0, TImode),
17838 operand_subword (rval, 1, 0, TImode), model_rtx));
17839 else
17840 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
17841 }
17842
17843 /* Emit jump pattern INSN and mark the jump as very unlikely to be taken. */
17844
17845 static void
17846 aarch64_emit_unlikely_jump (rtx insn)
17847 {
17848 rtx_insn *jump = emit_jump_insn (insn);
17849 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
17850 }
17851
17852 /* We store the names of the various atomic helpers in a 5x4 array.
17853 Return the libcall function given MODE, MODEL and NAMES. */
17854
17855 rtx
17856 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
17857 const atomic_ool_names *names)
17858 {
17859 memmodel model = memmodel_base (INTVAL (model_rtx));
17860 int mode_idx, model_idx;
17861
17862 switch (mode)
17863 {
17864 case E_QImode:
17865 mode_idx = 0;
17866 break;
17867 case E_HImode:
17868 mode_idx = 1;
17869 break;
17870 case E_SImode:
17871 mode_idx = 2;
17872 break;
17873 case E_DImode:
17874 mode_idx = 3;
17875 break;
17876 case E_TImode:
17877 mode_idx = 4;
17878 break;
17879 default:
17880 gcc_unreachable ();
17881 }
17882
17883 switch (model)
17884 {
17885 case MEMMODEL_RELAXED:
17886 model_idx = 0;
17887 break;
17888 case MEMMODEL_CONSUME:
17889 case MEMMODEL_ACQUIRE:
17890 model_idx = 1;
17891 break;
17892 case MEMMODEL_RELEASE:
17893 model_idx = 2;
17894 break;
17895 case MEMMODEL_ACQ_REL:
17896 case MEMMODEL_SEQ_CST:
17897 model_idx = 3;
17898 break;
17899 default:
17900 gcc_unreachable ();
17901 }
17902
17903 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
17904 VISIBILITY_HIDDEN);
17905 }
17906
17907 #define DEF0(B, N) \
17908 { "__aarch64_" #B #N "_relax", \
17909 "__aarch64_" #B #N "_acq", \
17910 "__aarch64_" #B #N "_rel", \
17911 "__aarch64_" #B #N "_acq_rel" }
17912
17913 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17914 { NULL, NULL, NULL, NULL }
17915 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17916
17917 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
17918 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
17919 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
17920 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
17921 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
17922 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
17923
17924 #undef DEF0
17925 #undef DEF4
17926 #undef DEF5
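
/* For example, with the tables above aarch64_ool_swp_names.str[2][1] is
   "__aarch64_swp4_acq", which is what aarch64_atomic_ool_func returns for an
   SImode swap with MEMMODEL_ACQUIRE.  The 16-byte entries are NULL in the
   DEF4 tables, so TImode is only usable with aarch64_ool_cas_names.  */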
17927
17928 /* Expand a compare and swap pattern. */
17929
17930 void
17931 aarch64_expand_compare_and_swap (rtx operands[])
17932 {
17933 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
17934 machine_mode mode, r_mode;
17935
17936 bval = operands[0];
17937 rval = operands[1];
17938 mem = operands[2];
17939 oldval = operands[3];
17940 newval = operands[4];
17941 is_weak = operands[5];
17942 mod_s = operands[6];
17943 mod_f = operands[7];
17944 mode = GET_MODE (mem);
17945
17946 /* Normally the succ memory model must be stronger than fail, but in the
17947 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17948 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17949 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
17950 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
17951 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
17952
17953 r_mode = mode;
17954 if (mode == QImode || mode == HImode)
17955 {
17956 r_mode = SImode;
17957 rval = gen_reg_rtx (r_mode);
17958 }
17959
17960 if (TARGET_LSE)
17961 {
17962 /* The CAS insn requires oldval and rval overlap, but we need to
17963 have a copy of oldval saved across the operation to tell if
17964 the operation is successful. */
17965 if (reg_overlap_mentioned_p (rval, oldval))
17966 rval = copy_to_mode_reg (r_mode, oldval);
17967 else
17968 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
17969
17970 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
17971 newval, mod_s));
17972 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17973 }
17974 else if (TARGET_OUTLINE_ATOMICS)
17975 {
17976 /* Oldval must satisfy compare afterward. */
17977 if (!aarch64_plus_operand (oldval, mode))
17978 oldval = force_reg (mode, oldval);
17979 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
17980 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
17981 oldval, mode, newval, mode,
17982 XEXP (mem, 0), Pmode);
17983 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17984 }
17985 else
17986 {
17987 /* The oldval predicate varies by mode. Test it and force to reg. */
17988 insn_code code = code_for_aarch64_compare_and_swap (mode);
17989 if (!insn_data[code].operand[2].predicate (oldval, mode))
17990 oldval = force_reg (mode, oldval);
17991
17992 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17993 is_weak, mod_s, mod_f));
17994 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17995 }
17996
17997 if (r_mode != mode)
17998 rval = gen_lowpart (mode, rval);
17999 emit_move_insn (operands[1], rval);
18000
18001 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18002 emit_insn (gen_rtx_SET (bval, x));
18003 }
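
/* As an illustration (the exact instructions depend on the memory model): a
   32-bit strong __atomic_compare_exchange_n with __ATOMIC_SEQ_CST reaches
   this expander with MODE == SImode.  With TARGET_LSE it becomes a single
   compare-and-swap instruction (typically CASAL for this model), with
   TARGET_OUTLINE_ATOMICS a call to __aarch64_cas4_acq_rel, and otherwise a
   load/store-exclusive loop that aarch64_split_compare_and_swap later
   splits out.  */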
18004
18005 /* Emit a barrier appropriate for memory model MODEL at the end of a
18006 sequence implementing an atomic operation. */
18007
18008 static void
18009 aarch64_emit_post_barrier (enum memmodel model)
18010 {
18011 const enum memmodel base_model = memmodel_base (model);
18012
18013 if (is_mm_sync (model)
18014 && (base_model == MEMMODEL_ACQUIRE
18015 || base_model == MEMMODEL_ACQ_REL
18016 || base_model == MEMMODEL_SEQ_CST))
18017 {
18018 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18019 }
18020 }
18021
18022 /* Split a compare and swap pattern. */
18023
18024 void
18025 aarch64_split_compare_and_swap (rtx operands[])
18026 {
18027 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18028 machine_mode mode;
18029 bool is_weak;
18030 rtx_code_label *label1, *label2;
18031 enum memmodel model;
18032
18033 rval = operands[0];
18034 mem = operands[1];
18035 oldval = operands[2];
18036 newval = operands[3];
18037 is_weak = (operands[4] != const0_rtx);
18038 model_rtx = operands[5];
18039 scratch = operands[7];
18040 mode = GET_MODE (mem);
18041 model = memmodel_from_int (INTVAL (model_rtx));
18042
18043 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18044 loop:
18045 .label1:
18046 LD[A]XR rval, [mem]
18047 CBNZ rval, .label2
18048 ST[L]XR scratch, newval, [mem]
18049 CBNZ scratch, .label1
18050 .label2:
18051 CMP rval, 0. */
18052 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
18053 oldval == const0_rtx && mode != TImode);
18054
18055 label1 = NULL;
18056 if (!is_weak)
18057 {
18058 label1 = gen_label_rtx ();
18059 emit_label (label1);
18060 }
18061 label2 = gen_label_rtx ();
18062
18063 /* The initial load can be relaxed for a __sync operation since a final
18064 barrier will be emitted to stop code hoisting. */
18065 if (is_mm_sync (model))
18066 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18067 else
18068 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18069
18070 if (strong_zero_p)
18071 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18072 else
18073 {
18074 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18075 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18076 }
18077 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18078 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18079 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18080
18081 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18082
18083 if (!is_weak)
18084 {
18085 if (aarch64_track_speculation)
18086 {
18087 /* Emit an explicit compare instruction, so that we can correctly
18088 track the condition codes. */
18089 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18090 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18091 }
18092 else
18093 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18094
18095 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18096 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18097 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18098 }
18099 else
18100 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18101
18102 emit_label (label2);
18103
18104 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
18105 to set the condition flags. If this is not used it will be removed by
18106 later passes. */
18107 if (strong_zero_p)
18108 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18109
18110 /* Emit any final barrier needed for a __sync operation. */
18111 if (is_mm_sync (model))
18112 aarch64_emit_post_barrier (model);
18113 }
18114
18115 /* Split an atomic operation. */
18116
18117 void
18118 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18119 rtx value, rtx model_rtx, rtx cond)
18120 {
18121 machine_mode mode = GET_MODE (mem);
18122 machine_mode wmode = (mode == DImode ? DImode : SImode);
18123 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18124 const bool is_sync = is_mm_sync (model);
18125 rtx_code_label *label;
18126 rtx x;
18127
18128 /* Split the atomic operation into a sequence. */
18129 label = gen_label_rtx ();
18130 emit_label (label);
18131
18132 if (new_out)
18133 new_out = gen_lowpart (wmode, new_out);
18134 if (old_out)
18135 old_out = gen_lowpart (wmode, old_out);
18136 else
18137 old_out = new_out;
18138 value = simplify_gen_subreg (wmode, value, mode, 0);
18139
18140 /* The initial load can be relaxed for a __sync operation since a final
18141 barrier will be emitted to stop code hoisting. */
18142 if (is_sync)
18143 aarch64_emit_load_exclusive (mode, old_out, mem,
18144 GEN_INT (MEMMODEL_RELAXED));
18145 else
18146 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18147
18148 switch (code)
18149 {
18150 case SET:
18151 new_out = value;
18152 break;
18153
18154 case NOT:
18155 x = gen_rtx_AND (wmode, old_out, value);
18156 emit_insn (gen_rtx_SET (new_out, x));
18157 x = gen_rtx_NOT (wmode, new_out);
18158 emit_insn (gen_rtx_SET (new_out, x));
18159 break;
18160
18161 case MINUS:
18162 if (CONST_INT_P (value))
18163 {
18164 value = GEN_INT (-INTVAL (value));
18165 code = PLUS;
18166 }
18167 /* Fall through. */
18168
18169 default:
18170 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18171 emit_insn (gen_rtx_SET (new_out, x));
18172 break;
18173 }
18174
18175 aarch64_emit_store_exclusive (mode, cond, mem,
18176 gen_lowpart (mode, new_out), model_rtx);
18177
18178 if (aarch64_track_speculation)
18179 {
18180 /* Emit an explicit compare instruction, so that we can correctly
18181 track the condition codes. */
18182 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18183 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18184 }
18185 else
18186 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18187
18188 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18189 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18190 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18191
18192 /* Emit any final barrier needed for a __sync operation. */
18193 if (is_sync)
18194 aarch64_emit_post_barrier (model);
18195 }
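
/* For example, an SImode fetch-and-add with a relaxed memory model is split
   into a loop of roughly this shape (register names are purely
   illustrative):

     .retry:
	ldxr	w0, [x2]	   // old_out
	add	w1, w0, w3	   // new_out = old_out + value
	stxr	w4, w1, [x2]	   // cond = store-exclusive status
	cbnz	w4, .retry

   with acquire/release variants of the exclusive accesses, and possibly a
   trailing barrier, chosen according to MODEL_RTX.  */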
18196
18197 static void
18198 aarch64_init_libfuncs (void)
18199 {
18200 /* Half-precision float operations. The compiler handles all operations
18201 with NULL libfuncs by converting to SFmode. */
18202
18203 /* Conversions. */
18204 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18205 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18206
18207 /* Arithmetic. */
18208 set_optab_libfunc (add_optab, HFmode, NULL);
18209 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18210 set_optab_libfunc (smul_optab, HFmode, NULL);
18211 set_optab_libfunc (neg_optab, HFmode, NULL);
18212 set_optab_libfunc (sub_optab, HFmode, NULL);
18213
18214 /* Comparisons. */
18215 set_optab_libfunc (eq_optab, HFmode, NULL);
18216 set_optab_libfunc (ne_optab, HFmode, NULL);
18217 set_optab_libfunc (lt_optab, HFmode, NULL);
18218 set_optab_libfunc (le_optab, HFmode, NULL);
18219 set_optab_libfunc (ge_optab, HFmode, NULL);
18220 set_optab_libfunc (gt_optab, HFmode, NULL);
18221 set_optab_libfunc (unord_optab, HFmode, NULL);
18222 }
18223
18224 /* Target hook for c_mode_for_suffix. */
18225 static machine_mode
18226 aarch64_c_mode_for_suffix (char suffix)
18227 {
18228 if (suffix == 'q')
18229 return TFmode;
18230
18231 return VOIDmode;
18232 }
18233
18234 /* We can only represent floating point constants which will fit in
18235 "quarter-precision" values. These values are characterised by
18236 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
18237 by:
18238
18239 (-1)^s * (n/16) * 2^r
18240
18241 Where:
18242 's' is the sign bit.
18243 'n' is an integer in the range 16 <= n <= 31.
18244 'r' is an integer in the range -3 <= r <= 4. */
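
/* For example, 0.125 (= 16/16 * 2^-3), 0.5, 1.0, 17.0 (= 17/16 * 2^4) and
   31.0 (= 31/16 * 2^4) are representable in this form, while 0.0, 0.1 and
   32.0 are not.  */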
18245
18246 /* Return true iff X can be represented as a quarter-precision
18247 floating point immediate operand. Note, we cannot represent 0.0. */
18248 bool
18249 aarch64_float_const_representable_p (rtx x)
18250 {
18251 /* This represents our current view of how many bits
18252 make up the mantissa. */
18253 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18254 int exponent;
18255 unsigned HOST_WIDE_INT mantissa, mask;
18256 REAL_VALUE_TYPE r, m;
18257 bool fail;
18258
18259 x = unwrap_const_vec_duplicate (x);
18260 if (!CONST_DOUBLE_P (x))
18261 return false;
18262
18263 if (GET_MODE (x) == VOIDmode
18264 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18265 return false;
18266
18267 r = *CONST_DOUBLE_REAL_VALUE (x);
18268
18269 /* We cannot represent infinities, NaNs or +/-zero. We won't
18270 know if we have +zero until we analyse the mantissa, but we
18271 can reject the other invalid values. */
18272 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18273 || REAL_VALUE_MINUS_ZERO (r))
18274 return false;
18275
18276 /* Extract exponent. */
18277 r = real_value_abs (&r);
18278 exponent = REAL_EXP (&r);
18279
18280 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
18281 highest (sign) bit, with a fixed binary point at bit point_pos.
18282 The low half of W holds the low part of the mantissa, the high half the high part.
18283 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18284 bits for the mantissa, this can fail (low bits will be lost). */
18285 real_ldexp (&m, &r, point_pos - exponent);
18286 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18287
18288 /* If the low part of the mantissa has bits set we cannot represent
18289 the value. */
18290 if (w.ulow () != 0)
18291 return false;
18292 /* We have rejected the lower HOST_WIDE_INT, so update our
18293 understanding of how many bits lie in the mantissa and
18294 look only at the high HOST_WIDE_INT. */
18295 mantissa = w.elt (1);
18296 point_pos -= HOST_BITS_PER_WIDE_INT;
18297
18298 /* We can only represent values with a mantissa of the form 1.xxxx. */
18299 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18300 if ((mantissa & mask) != 0)
18301 return false;
18302
18303 /* Having filtered unrepresentable values, we may now remove all
18304 but the highest 5 bits. */
18305 mantissa >>= point_pos - 5;
18306
18307 /* We cannot represent the value 0.0, so reject it. This is handled
18308 elsewhere. */
18309 if (mantissa == 0)
18310 return false;
18311
18312 /* Then, as bit 4 is always set, we can mask it off, leaving
18313 the mantissa in the range [0, 15]. */
18314 mantissa &= ~(1 << 4);
18315 gcc_assert (mantissa <= 15);
18316
18317 /* GCC internally does not use an IEEE754-like encoding (where normalized
18318 significands are in the range [1, 2)); it uses [0.5, 1) (see real.c).
18319 Our mantissa values are shifted 4 places to the left relative to
18320 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18321 by 5 places to correct for GCC's representation. */
18322 exponent = 5 - exponent;
18323
18324 return (exponent >= 0 && exponent <= 7);
18325 }
18326
18327 /* Return the string with the AdvSIMD MOVI, MVNI, ORR or BIC immediate
18328 instruction for moving CONST_VECTOR into a vector of WIDTH bits. WHICH
18329 selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
18330 char*
18331 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18332 enum simd_immediate_check which)
18333 {
18334 bool is_valid;
18335 static char templ[40];
18336 const char *mnemonic;
18337 const char *shift_op;
18338 unsigned int lane_count = 0;
18339 char element_char;
18340
18341 struct simd_immediate_info info;
18342
18343 /* This will return true to show const_vector is legal for use as either
18344 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18345 It will also update INFO to show how the immediate should be generated.
18346 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18347 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18348 gcc_assert (is_valid);
18349
18350 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18351 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18352
18353 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18354 {
18355 gcc_assert (info.insn == simd_immediate_info::MOV
18356 && info.u.mov.shift == 0);
18357 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18358 move immediate path. */
18359 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18360 info.u.mov.value = GEN_INT (0);
18361 else
18362 {
18363 const unsigned int buf_size = 20;
18364 char float_buf[buf_size] = {'\0'};
18365 real_to_decimal_for_mode (float_buf,
18366 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18367 buf_size, buf_size, 1, info.elt_mode);
18368
18369 if (lane_count == 1)
18370 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18371 else
18372 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18373 lane_count, element_char, float_buf);
18374 return templ;
18375 }
18376 }
18377
18378 gcc_assert (CONST_INT_P (info.u.mov.value));
18379
18380 if (which == AARCH64_CHECK_MOV)
18381 {
18382 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18383 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18384 ? "msl" : "lsl");
18385 if (lane_count == 1)
18386 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18387 mnemonic, UINTVAL (info.u.mov.value));
18388 else if (info.u.mov.shift)
18389 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18390 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18391 element_char, UINTVAL (info.u.mov.value), shift_op,
18392 info.u.mov.shift);
18393 else
18394 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18395 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18396 element_char, UINTVAL (info.u.mov.value));
18397 }
18398 else
18399 {
18400 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18401 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18402 if (info.u.mov.shift)
18403 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18404 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18405 element_char, UINTVAL (info.u.mov.value), "lsl",
18406 info.u.mov.shift);
18407 else
18408 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18409 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18410 element_char, UINTVAL (info.u.mov.value));
18411 }
18412 return templ;
18413 }
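
/* A purely illustrative example of the templates built above: a V4SImode
   constant whose elements are all 256 would be emitted via the MOVI path as
   "movi\t%0.4s, 0x1, lsl 8", while an immediate with lane_count == 1 uses
   the scalar "%d0" form instead.  */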
18414
18415 char*
18416 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18417 {
18418
18419 /* If a floating point number was passed and we want to use it in an
18420 integer mode, do the conversion to integer. */
18421 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18422 {
18423 unsigned HOST_WIDE_INT ival;
18424 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18425 gcc_unreachable ();
18426 immediate = gen_int_mode (ival, mode);
18427 }
18428
18429 machine_mode vmode;
18430 /* Use a 64-bit container for everything except 64-bit (DI/DF) modes,
18431 where we use a 128-bit vector mode. */
18432 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18433
18434 vmode = aarch64_simd_container_mode (mode, width);
18435 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18436 return aarch64_output_simd_mov_immediate (v_op, width);
18437 }
18438
18439 /* Return the output string to use for moving immediate CONST_VECTOR
18440 into an SVE register. */
18441
18442 char *
18443 aarch64_output_sve_mov_immediate (rtx const_vector)
18444 {
18445 static char templ[40];
18446 struct simd_immediate_info info;
18447 char element_char;
18448
18449 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18450 gcc_assert (is_valid);
18451
18452 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18453
18454 machine_mode vec_mode = GET_MODE (const_vector);
18455 if (aarch64_sve_pred_mode_p (vec_mode))
18456 {
18457 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18458 if (info.insn == simd_immediate_info::MOV)
18459 {
18460 gcc_assert (info.u.mov.value == const0_rtx);
18461 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18462 }
18463 else
18464 {
18465 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18466 unsigned int total_bytes;
18467 if (info.u.pattern == AARCH64_SV_ALL
18468 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18469 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
18470 total_bytes / GET_MODE_SIZE (info.elt_mode));
18471 else
18472 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
18473 svpattern_token (info.u.pattern));
18474 }
18475 return buf;
18476 }
18477
18478 if (info.insn == simd_immediate_info::INDEX)
18479 {
18480 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
18481 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
18482 element_char, INTVAL (info.u.index.base),
18483 INTVAL (info.u.index.step));
18484 return templ;
18485 }
18486
18487 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18488 {
18489 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18490 info.u.mov.value = GEN_INT (0);
18491 else
18492 {
18493 const int buf_size = 20;
18494 char float_buf[buf_size] = {};
18495 real_to_decimal_for_mode (float_buf,
18496 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18497 buf_size, buf_size, 1, info.elt_mode);
18498
18499 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
18500 element_char, float_buf);
18501 return templ;
18502 }
18503 }
18504
18505 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
18506 element_char, INTVAL (info.u.mov.value));
18507 return templ;
18508 }
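
/* Some examples of the templates built above (operand syntax as written,
   before final substitution): a duplicated integer 5 in VNx4SImode gives
   "mov\t%0.s, #5", the linear series {0, 1, 2, ...} gives
   "index\t%0.s, #0, #1", and an all-true VNx16BImode predicate on a target
   with a known 16-byte vector length gives "ptrue\t%0.b, vl16".  */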
18509
18510 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18511 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18512 pattern. */
18513
18514 char *
18515 aarch64_output_sve_ptrues (rtx const_unspec)
18516 {
18517 static char templ[40];
18518
18519 struct simd_immediate_info info;
18520 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
18521 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
18522
18523 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18524 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
18525 svpattern_token (info.u.pattern));
18526 return templ;
18527 }
18528
18529 /* Split operands into moves from op[1] + op[2] into op[0]. */
18530
18531 void
18532 aarch64_split_combinev16qi (rtx operands[3])
18533 {
18534 unsigned int dest = REGNO (operands[0]);
18535 unsigned int src1 = REGNO (operands[1]);
18536 unsigned int src2 = REGNO (operands[2]);
18537 machine_mode halfmode = GET_MODE (operands[1]);
18538 unsigned int halfregs = REG_NREGS (operands[1]);
18539 rtx destlo, desthi;
18540
18541 gcc_assert (halfmode == V16QImode);
18542
18543 if (src1 == dest && src2 == dest + halfregs)
18544 {
18545 /* No-op move. Can't split to nothing; emit something. */
18546 emit_note (NOTE_INSN_DELETED);
18547 return;
18548 }
18549
18550 /* Preserve register attributes for variable tracking. */
18551 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
18552 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
18553 GET_MODE_SIZE (halfmode));
18554
18555 /* Special case of reversed high/low parts. */
18556 if (reg_overlap_mentioned_p (operands[2], destlo)
18557 && reg_overlap_mentioned_p (operands[1], desthi))
18558 {
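      /* Swap the two inputs in place with the classic three-XOR trick, so
	 that no scratch register is needed.  */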
18559 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18560 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
18561 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18562 }
18563 else if (!reg_overlap_mentioned_p (operands[2], destlo))
18564 {
18565 /* Try to avoid unnecessary moves if part of the result
18566 is in the right place already. */
18567 if (src1 != dest)
18568 emit_move_insn (destlo, operands[1]);
18569 if (src2 != dest + halfregs)
18570 emit_move_insn (desthi, operands[2]);
18571 }
18572 else
18573 {
18574 if (src2 != dest + halfregs)
18575 emit_move_insn (desthi, operands[2]);
18576 if (src1 != dest)
18577 emit_move_insn (destlo, operands[1]);
18578 }
18579 }
18580
18581 /* vec_perm support. */
18582
18583 struct expand_vec_perm_d
18584 {
18585 rtx target, op0, op1;
18586 vec_perm_indices perm;
18587 machine_mode vmode;
18588 unsigned int vec_flags;
18589 bool one_vector_p;
18590 bool testing_p;
18591 };
18592
18593 /* Generate a variable permutation. */
18594
18595 static void
18596 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
18597 {
18598 machine_mode vmode = GET_MODE (target);
18599 bool one_vector_p = rtx_equal_p (op0, op1);
18600
18601 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
18602 gcc_checking_assert (GET_MODE (op0) == vmode);
18603 gcc_checking_assert (GET_MODE (op1) == vmode);
18604 gcc_checking_assert (GET_MODE (sel) == vmode);
18605 gcc_checking_assert (TARGET_SIMD);
18606
18607 if (one_vector_p)
18608 {
18609 if (vmode == V8QImode)
18610 {
18611 /* Expand the argument to a V16QI mode by duplicating it. */
18612 rtx pair = gen_reg_rtx (V16QImode);
18613 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
18614 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18615 }
18616 else
18617 {
18618 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
18619 }
18620 }
18621 else
18622 {
18623 rtx pair;
18624
18625 if (vmode == V8QImode)
18626 {
18627 pair = gen_reg_rtx (V16QImode);
18628 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
18629 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18630 }
18631 else
18632 {
18633 pair = gen_reg_rtx (OImode);
18634 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
18635 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
18636 }
18637 }
18638 }
18639
18640 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
18641 NELT is the number of elements in the vector. */
18642
18643 void
18644 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
18645 unsigned int nelt)
18646 {
18647 machine_mode vmode = GET_MODE (target);
18648 bool one_vector_p = rtx_equal_p (op0, op1);
18649 rtx mask;
18650
18651 /* The TBL instruction does not use a modulo index, so we must take care
18652 of that ourselves. */
18653 mask = aarch64_simd_gen_const_vector_dup (vmode,
18654 one_vector_p ? nelt - 1 : 2 * nelt - 1);
18655 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
18656
18657 /* For big-endian, we also need to reverse the index within the vector
18658 (but not which vector). */
18659 if (BYTES_BIG_ENDIAN)
18660 {
18661 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
18662 if (!one_vector_p)
18663 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
18664 sel = expand_simple_binop (vmode, XOR, sel, mask,
18665 NULL, 0, OPTAB_LIB_WIDEN);
18666 }
18667 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
18668 }
18669
18670 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
18671
18672 static void
18673 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
18674 {
18675 emit_insn (gen_rtx_SET (target,
18676 gen_rtx_UNSPEC (GET_MODE (target),
18677 gen_rtvec (2, op0, op1), code)));
18678 }
18679
18680 /* Expand an SVE vec_perm with the given operands. */
18681
18682 void
18683 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
18684 {
18685 machine_mode data_mode = GET_MODE (target);
18686 machine_mode sel_mode = GET_MODE (sel);
18687 /* Enforced by the pattern condition. */
18688 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
18689
18690 /* Note: vec_perm indices are supposed to wrap when they go beyond the
18691 size of the two value vectors, i.e. the upper bits of the indices
18692 are effectively ignored. SVE TBL instead produces 0 for any
18693 out-of-range indices, so we need to modulo all the vec_perm indices
18694 to ensure they are all in range. */
18695 rtx sel_reg = force_reg (sel_mode, sel);
18696
18697 /* Check if the sel only references the first values vector. */
18698 if (GET_CODE (sel) == CONST_VECTOR
18699 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
18700 {
18701 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
18702 return;
18703 }
18704
18705 /* Check if the two values vectors are the same. */
18706 if (rtx_equal_p (op0, op1))
18707 {
18708 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
18709 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
18710 NULL, 0, OPTAB_DIRECT);
18711 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
18712 return;
18713 }
18714
18715 /* Run TBL on each value vector and combine the results. */
18716
18717 rtx res0 = gen_reg_rtx (data_mode);
18718 rtx res1 = gen_reg_rtx (data_mode);
18719 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
18720 if (GET_CODE (sel) != CONST_VECTOR
18721 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
18722 {
18723 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
18724 2 * nunits - 1);
18725 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
18726 NULL, 0, OPTAB_DIRECT);
18727 }
18728 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
18729 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
18730 NULL, 0, OPTAB_DIRECT);
18731 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
18732 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
18733 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
18734 else
18735 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
18736 }
18737
18738 /* Recognize patterns suitable for the TRN instructions. */
18739 static bool
18740 aarch64_evpc_trn (struct expand_vec_perm_d *d)
18741 {
18742 HOST_WIDE_INT odd;
18743 poly_uint64 nelt = d->perm.length ();
18744 rtx out, in0, in1, x;
18745 machine_mode vmode = d->vmode;
18746
18747 if (GET_MODE_UNIT_SIZE (vmode) > 8)
18748 return false;
18749
18750 /* Note that these are little-endian tests.
18751 We correct for big-endian later. */
18752 if (!d->perm[0].is_constant (&odd)
18753 || (odd != 0 && odd != 1)
18754 || !d->perm.series_p (0, 2, odd, 2)
18755 || !d->perm.series_p (1, 2, nelt + odd, 2))
18756 return false;
18757
18758 /* Success! */
18759 if (d->testing_p)
18760 return true;
18761
18762 in0 = d->op0;
18763 in1 = d->op1;
18764 /* We don't need a big-endian lane correction for SVE; see the comment
18765 at the head of aarch64-sve.md for details. */
18766 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
18767 {
18768 x = in0, in0 = in1, in1 = x;
18769 odd = !odd;
18770 }
18771 out = d->target;
18772
18773 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
18774 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
18775 return true;
18776 }
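
/* For example, with V4SImode inputs the permutation {0, 4, 2, 6} is matched
   here as TRN1 and {1, 5, 3, 7} as TRN2 (before any big-endian correction).  */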
18777
18778 /* Recognize patterns suitable for the UZP instructions. */
18779 static bool
18780 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
18781 {
18782 HOST_WIDE_INT odd;
18783 rtx out, in0, in1, x;
18784 machine_mode vmode = d->vmode;
18785
18786 if (GET_MODE_UNIT_SIZE (vmode) > 8)
18787 return false;
18788
18789 /* Note that these are little-endian tests.
18790 We correct for big-endian later. */
18791 if (!d->perm[0].is_constant (&odd)
18792 || (odd != 0 && odd != 1)
18793 || !d->perm.series_p (0, 1, odd, 2))
18794 return false;
18795
18796 /* Success! */
18797 if (d->testing_p)
18798 return true;
18799
18800 in0 = d->op0;
18801 in1 = d->op1;
18802 /* We don't need a big-endian lane correction for SVE; see the comment
18803 at the head of aarch64-sve.md for details. */
18804 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
18805 {
18806 x = in0, in0 = in1, in1 = x;
18807 odd = !odd;
18808 }
18809 out = d->target;
18810
18811 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
18812 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
18813 return true;
18814 }
18815
18816 /* Recognize patterns suitable for the ZIP instructions. */
18817 static bool
18818 aarch64_evpc_zip (struct expand_vec_perm_d *d)
18819 {
18820 unsigned int high;
18821 poly_uint64 nelt = d->perm.length ();
18822 rtx out, in0, in1, x;
18823 machine_mode vmode = d->vmode;
18824
18825 if (GET_MODE_UNIT_SIZE (vmode) > 8)
18826 return false;
18827
18828 /* Note that these are little-endian tests.
18829 We correct for big-endian later. */
18830 poly_uint64 first = d->perm[0];
18831 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
18832 || !d->perm.series_p (0, 2, first, 1)
18833 || !d->perm.series_p (1, 2, first + nelt, 1))
18834 return false;
18835 high = maybe_ne (first, 0U);
18836
18837 /* Success! */
18838 if (d->testing_p)
18839 return true;
18840
18841 in0 = d->op0;
18842 in1 = d->op1;
18843 /* We don't need a big-endian lane correction for SVE; see the comment
18844 at the head of aarch64-sve.md for details. */
18845 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
18846 {
18847 x = in0, in0 = in1, in1 = x;
18848 high = !high;
18849 }
18850 out = d->target;
18851
18852 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
18853 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
18854 return true;
18855 }
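
/* For example, with V4SImode inputs the permutation {0, 4, 1, 5} is matched
   here as ZIP1 and {2, 6, 3, 7} as ZIP2 (before any big-endian correction).  */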
18856
18857 /* Recognize patterns for the EXT insn. */
18858
18859 static bool
18860 aarch64_evpc_ext (struct expand_vec_perm_d *d)
18861 {
18862 HOST_WIDE_INT location;
18863 rtx offset;
18864
18865 /* The first element always refers to the first vector.
18866 Check if the extracted indices are increasing by one. */
18867 if (d->vec_flags == VEC_SVE_PRED
18868 || !d->perm[0].is_constant (&location)
18869 || !d->perm.series_p (0, 1, location, 1))
18870 return false;
18871
18872 /* Success! */
18873 if (d->testing_p)
18874 return true;
18875
18876 /* The case where (location == 0) is a no-op for both big- and little-endian,
18877 and is removed by the mid-end at optimization levels -O1 and higher.
18878
18879 We don't need a big-endian lane correction for SVE; see the comment
18880 at the head of aarch64-sve.md for details. */
18881 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
18882 {
18883 /* After setup, we want the high elements of the first vector (stored
18884 at the LSB end of the register), and the low elements of the second
18885 vector (stored at the MSB end of the register). So swap. */
18886 std::swap (d->op0, d->op1);
18887 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
18888 to_constant () is safe since this is restricted to Advanced SIMD
18889 vectors. */
18890 location = d->perm.length ().to_constant () - location;
18891 }
18892
18893 offset = GEN_INT (location);
18894 emit_set_insn (d->target,
18895 gen_rtx_UNSPEC (d->vmode,
18896 gen_rtvec (3, d->op0, d->op1, offset),
18897 UNSPEC_EXT));
18898 return true;
18899 }
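
/* For example, with V4SImode inputs the permutation {1, 2, 3, 4} is matched
   here as an EXT of the two vectors with an element offset of 1.  */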
18900
18901 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
18902 within each 64-bit, 32-bit or 16-bit granule. */
18903
18904 static bool
18905 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
18906 {
18907 HOST_WIDE_INT diff;
18908 unsigned int i, size, unspec;
18909 machine_mode pred_mode;
18910
18911 if (d->vec_flags == VEC_SVE_PRED
18912 || !d->one_vector_p
18913 || !d->perm[0].is_constant (&diff))
18914 return false;
18915
18916 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
18917 if (size == 8)
18918 {
18919 unspec = UNSPEC_REV64;
18920 pred_mode = VNx2BImode;
18921 }
18922 else if (size == 4)
18923 {
18924 unspec = UNSPEC_REV32;
18925 pred_mode = VNx4BImode;
18926 }
18927 else if (size == 2)
18928 {
18929 unspec = UNSPEC_REV16;
18930 pred_mode = VNx8BImode;
18931 }
18932 else
18933 return false;
18934
18935 unsigned int step = diff + 1;
18936 for (i = 0; i < step; ++i)
18937 if (!d->perm.series_p (i, step, diff - i, step))
18938 return false;
18939
18940 /* Success! */
18941 if (d->testing_p)
18942 return true;
18943
18944 if (d->vec_flags == VEC_SVE_DATA)
18945 {
18946 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
18947 rtx target = gen_reg_rtx (int_mode);
18948 if (BYTES_BIG_ENDIAN)
18949 /* The act of taking a subreg between INT_MODE and d->vmode
18950 is itself a reversing operation on big-endian targets;
18951 see the comment at the head of aarch64-sve.md for details.
18952 First reinterpret OP0 as INT_MODE without using a subreg
18953 and without changing the contents. */
18954 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
18955 else
18956 {
18957 /* For SVE we use REV[BHW] unspecs derived from the element size
18958 of d->vmode, operating on integer vector modes whose elements have
18959 SIZE bytes. This ensures that the vector modes match the predicate modes. */
18960 int unspec = aarch64_sve_rev_unspec (d->vmode);
18961 rtx pred = aarch64_ptrue_reg (pred_mode);
18962 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
18963 gen_lowpart (int_mode, d->op0)));
18964 }
18965 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18966 return true;
18967 }
18968 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
18969 emit_set_insn (d->target, src);
18970 return true;
18971 }
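
/* For example, the single-vector V4SImode permutation {1, 0, 3, 2} swaps
   the elements within each 64-bit granule and is matched here as REV64.  */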
18972
18973 /* Recognize patterns for the REV insn, which reverses elements within
18974 a full vector. */
18975
18976 static bool
18977 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
18978 {
18979 poly_uint64 nelt = d->perm.length ();
18980
18981 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
18982 return false;
18983
18984 if (!d->perm.series_p (0, 1, nelt - 1, -1))
18985 return false;
18986
18987 /* Success! */
18988 if (d->testing_p)
18989 return true;
18990
18991 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
18992 emit_set_insn (d->target, src);
18993 return true;
18994 }
18995
18996 static bool
18997 aarch64_evpc_dup (struct expand_vec_perm_d *d)
18998 {
18999 rtx out = d->target;
19000 rtx in0;
19001 HOST_WIDE_INT elt;
19002 machine_mode vmode = d->vmode;
19003 rtx lane;
19004
19005 if (d->vec_flags == VEC_SVE_PRED
19006 || d->perm.encoding ().encoded_nelts () != 1
19007 || !d->perm[0].is_constant (&elt))
19008 return false;
19009
19010 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19011 return false;
19012
19013 /* Success! */
19014 if (d->testing_p)
19015 return true;
19016
19017 /* The generic preparation in aarch64_expand_vec_perm_const_1
19018 swaps the operand order and the permute indices if it finds
19019 d->perm[0] to be in the second operand. Thus, we can always
19020 use d->op0 and need not do any extra arithmetic to get the
19021 correct lane number. */
19022 in0 = d->op0;
19023 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19024
19025 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19026 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19027 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19028 return true;
19029 }
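
/* For example, the single-pattern V4SImode permutation {2, 2, 2, 2} is
   matched here as a DUP of lane 2 of the first input.  */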
19030
19031 static bool
19032 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19033 {
19034 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19035 machine_mode vmode = d->vmode;
19036
19037 /* Make sure that the indices are constant. */
19038 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19039 for (unsigned int i = 0; i < encoded_nelts; ++i)
19040 if (!d->perm[i].is_constant ())
19041 return false;
19042
19043 if (d->testing_p)
19044 return true;
19045
19046 /* Generic code will try constant permutation twice: once with the
19047 original mode and again with the elements lowered to QImode.
19048 So wait and don't do the selector expansion ourselves. */
19049 if (vmode != V8QImode && vmode != V16QImode)
19050 return false;
19051
19052 /* to_constant is safe since this routine is specific to Advanced SIMD
19053 vectors. */
19054 unsigned int nelt = d->perm.length ().to_constant ();
19055 for (unsigned int i = 0; i < nelt; ++i)
19056 /* If big-endian and two vectors, we end up with a weird mixed-endian
19057 mode on NEON. Reverse the index within each word but not the word
19058 itself. to_constant is safe because we checked is_constant above. */
19059 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19060 ? d->perm[i].to_constant () ^ (nelt - 1)
19061 : d->perm[i].to_constant ());
19062
19063 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19064 sel = force_reg (vmode, sel);
19065
19066 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19067 return true;
19068 }
19069
19070 /* Try to implement D using an SVE TBL instruction. */
19071
19072 static bool
19073 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19074 {
19075 unsigned HOST_WIDE_INT nelt;
19076
19077 /* Permuting two variable-length vectors could overflow the
19078 index range. */
19079 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19080 return false;
19081
19082 if (d->testing_p)
19083 return true;
19084
19085 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
19086 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19087 if (d->one_vector_p)
19088 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19089 else
19090 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19091 return true;
19092 }
19093
19094 /* Try to implement D using SVE SEL instruction. */
19095
19096 static bool
19097 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19098 {
19099 machine_mode vmode = d->vmode;
19100 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19101
19102 if (d->vec_flags != VEC_SVE_DATA
19103 || unit_size > 8)
19104 return false;
19105
19106 int n_patterns = d->perm.encoding ().npatterns ();
19107 poly_int64 vec_len = d->perm.length ();
19108
19109 for (int i = 0; i < n_patterns; ++i)
19110 if (!known_eq (d->perm[i], i)
19111 && !known_eq (d->perm[i], vec_len + i))
19112 return false;
19113
19114 for (int i = n_patterns; i < n_patterns * 2; i++)
19115 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19116 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19117 return false;
19118
19119 if (d->testing_p)
19120 return true;
19121
19122 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
19123
19124 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19125 for (int i = 0; i < n_patterns * 2; i++)
19126 {
19127 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19128 : CONST0_RTX (BImode);
19129 builder.quick_push (elem);
19130 }
19131
19132 rtx const_vec = builder.build ();
19133 rtx pred = force_reg (pred_mode, const_vec);
19134 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
19135 return true;
19136 }
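
/* For example, for VNx4SImode (writing N for the runtime number of elements)
   the permutation {0, N + 1, 2, N + 3, ...} interleaves elements of the two
   inputs at fixed positions and is matched here as a predicated SEL with an
   alternating predicate.  */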
19137
19138 static bool
19139 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19140 {
19141 /* The pattern matching functions above are written to look for a small
19142 number to begin the sequence (0, 1, N/2). If we begin with an index
19143 from the second operand, we can swap the operands. */
19144 poly_int64 nelt = d->perm.length ();
19145 if (known_ge (d->perm[0], nelt))
19146 {
19147 d->perm.rotate_inputs (1);
19148 std::swap (d->op0, d->op1);
19149 }
19150
19151 if ((d->vec_flags == VEC_ADVSIMD
19152 || d->vec_flags == VEC_SVE_DATA
19153 || d->vec_flags == VEC_SVE_PRED)
19154 && known_gt (nelt, 1))
19155 {
19156 if (aarch64_evpc_rev_local (d))
19157 return true;
19158 else if (aarch64_evpc_rev_global (d))
19159 return true;
19160 else if (aarch64_evpc_ext (d))
19161 return true;
19162 else if (aarch64_evpc_dup (d))
19163 return true;
19164 else if (aarch64_evpc_zip (d))
19165 return true;
19166 else if (aarch64_evpc_uzp (d))
19167 return true;
19168 else if (aarch64_evpc_trn (d))
19169 return true;
19170 else if (aarch64_evpc_sel (d))
19171 return true;
19172 if (d->vec_flags == VEC_SVE_DATA)
19173 return aarch64_evpc_sve_tbl (d);
19174 else if (d->vec_flags == VEC_ADVSIMD)
19175 return aarch64_evpc_tbl (d);
19176 }
19177 return false;
19178 }
19179
19180 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19181
19182 static bool
19183 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19184 rtx op1, const vec_perm_indices &sel)
19185 {
19186 struct expand_vec_perm_d d;
19187
19188 /* Check whether the mask can be applied to a single vector. */
19189 if (sel.ninputs () == 1
19190 || (op0 && rtx_equal_p (op0, op1)))
19191 d.one_vector_p = true;
19192 else if (sel.all_from_input_p (0))
19193 {
19194 d.one_vector_p = true;
19195 op1 = op0;
19196 }
19197 else if (sel.all_from_input_p (1))
19198 {
19199 d.one_vector_p = true;
19200 op0 = op1;
19201 }
19202 else
19203 d.one_vector_p = false;
19204
19205 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19206 sel.nelts_per_input ());
19207 d.vmode = vmode;
19208 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19209 d.target = target;
19210 d.op0 = op0;
19211 d.op1 = op1;
19212 d.testing_p = !target;
19213
19214 if (!d.testing_p)
19215 return aarch64_expand_vec_perm_const_1 (&d);
19216
19217 rtx_insn *last = get_last_insn ();
19218 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19219 gcc_assert (last == get_last_insn ());
19220
19221 return ret;
19222 }
19223
19224 /* Generate a byte permute mask for a register of mode MODE,
19225 which has NUNITS units. */
19226
19227 rtx
19228 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19229 {
19230 /* We have to reverse each vector because we don't have
19231 a permuted load that can reverse-load according to ABI rules. */
19232 rtx mask;
19233 rtvec v = rtvec_alloc (16);
19234 unsigned int i, j;
19235 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19236
19237 gcc_assert (BYTES_BIG_ENDIAN);
19238 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19239
19240 for (i = 0; i < nunits; i++)
19241 for (j = 0; j < usize; j++)
19242 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19243 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19244 return force_reg (V16QImode, mask);
19245 }
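/* Editor's note: a standalone sketch (not GCC code) of the byte indices the
   loop above produces; for four 4-byte units it prints
   3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12, i.e. the bytes of each element are
   reversed in place.  */
#if 0
#include <stdio.h>

static void
print_reverse_mask (unsigned int nunits, unsigned int usize)
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      printf ("%u ", (i + 1) * usize - 1 - j);
  printf ("\n");
}
#endif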
19246
19247 /* Expand an SVE integer comparison using the SVE equivalent of:
19248
19249 (set TARGET (CODE OP0 OP1)). */
19250
19251 void
19252 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19253 {
19254 machine_mode pred_mode = GET_MODE (target);
19255 machine_mode data_mode = GET_MODE (op0);
19256 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19257 op0, op1);
19258 if (!rtx_equal_p (target, res))
19259 emit_move_insn (target, res);
19260 }
19261
19262 /* Return the UNSPEC_COND_* code for comparison CODE. */
19263
19264 static unsigned int
19265 aarch64_unspec_cond_code (rtx_code code)
19266 {
19267 switch (code)
19268 {
19269 case NE:
19270 return UNSPEC_COND_FCMNE;
19271 case EQ:
19272 return UNSPEC_COND_FCMEQ;
19273 case LT:
19274 return UNSPEC_COND_FCMLT;
19275 case GT:
19276 return UNSPEC_COND_FCMGT;
19277 case LE:
19278 return UNSPEC_COND_FCMLE;
19279 case GE:
19280 return UNSPEC_COND_FCMGE;
19281 case UNORDERED:
19282 return UNSPEC_COND_FCMUO;
19283 default:
19284 gcc_unreachable ();
19285 }
19286 }
19287
19288 /* Emit:
19289
19290 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19291
19292 where <X> is the operation associated with comparison CODE.
19293 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19294
19295 static void
19296 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19297 bool known_ptrue_p, rtx op0, rtx op1)
19298 {
19299 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19300 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19301 gen_rtvec (4, pred, flag, op0, op1),
19302 aarch64_unspec_cond_code (code));
19303 emit_set_insn (target, unspec);
19304 }
19305
19306 /* Emit the SVE equivalent of:
19307
19308 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19309 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19310 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19311
19312 where <Xi> is the operation associated with comparison CODEi.
19313 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19314
19315 static void
19316 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19317 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19318 {
19319 machine_mode pred_mode = GET_MODE (pred);
19320 rtx tmp1 = gen_reg_rtx (pred_mode);
19321 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19322 rtx tmp2 = gen_reg_rtx (pred_mode);
19323 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19324 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19325 }
19326
19327 /* Emit the SVE equivalent of:
19328
19329 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19330 (set TARGET (not TMP))
19331
19332 where <X> is the operation associated with comparison CODE.
19333 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19334
19335 static void
19336 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19337 bool known_ptrue_p, rtx op0, rtx op1)
19338 {
19339 machine_mode pred_mode = GET_MODE (pred);
19340 rtx tmp = gen_reg_rtx (pred_mode);
19341 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19342 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19343 }
19344
19345 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19346
19347 (set TARGET (CODE OP0 OP1))
19348
19349 If CAN_INVERT_P is true, the caller can also handle inverted results;
19350 return true if the result is in fact inverted. */
19351
19352 bool
19353 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19354 rtx op0, rtx op1, bool can_invert_p)
19355 {
19356 machine_mode pred_mode = GET_MODE (target);
19357 machine_mode data_mode = GET_MODE (op0);
19358
19359 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19360 switch (code)
19361 {
19362 case UNORDERED:
19363 /* UNORDERED has no immediate form. */
19364 op1 = force_reg (data_mode, op1);
19365 /* fall through */
19366 case LT:
19367 case LE:
19368 case GT:
19369 case GE:
19370 case EQ:
19371 case NE:
19372 {
19373 /* There is native support for the comparison. */
19374 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19375 return false;
19376 }
19377
19378 case LTGT:
19379 /* This is a trapping operation (LT or GT). */
19380 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19381 return false;
19382
19383 case UNEQ:
19384 if (!flag_trapping_math)
19385 {
19386 /* This would trap for signaling NaNs. */
19387 op1 = force_reg (data_mode, op1);
19388 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19389 ptrue, true, op0, op1);
19390 return false;
19391 }
19392 /* fall through */
19393 case UNLT:
19394 case UNLE:
19395 case UNGT:
19396 case UNGE:
19397 if (flag_trapping_math)
19398 {
19399 /* Work out which elements are ordered. */
19400 rtx ordered = gen_reg_rtx (pred_mode);
19401 op1 = force_reg (data_mode, op1);
19402 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19403 ptrue, true, op0, op1);
19404
19405 /* Test the opposite condition for the ordered elements,
19406 then invert the result. */
19407 if (code == UNEQ)
19408 code = NE;
19409 else
19410 code = reverse_condition_maybe_unordered (code);
19411 if (can_invert_p)
19412 {
19413 aarch64_emit_sve_fp_cond (target, code,
19414 ordered, false, op0, op1);
19415 return true;
19416 }
19417 aarch64_emit_sve_invert_fp_cond (target, code,
19418 ordered, false, op0, op1);
19419 return false;
19420 }
19421 break;
19422
19423 case ORDERED:
19424 /* ORDERED has no immediate form. */
19425 op1 = force_reg (data_mode, op1);
19426 break;
19427
19428 default:
19429 gcc_unreachable ();
19430 }
19431
19432 /* There is native support for the inverse comparison. */
19433 code = reverse_condition_maybe_unordered (code);
19434 if (can_invert_p)
19435 {
19436 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19437 return true;
19438 }
19439 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19440 return false;
19441 }
19442
19443 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19444 of the data being selected and CMP_MODE is the mode of the values being
19445 compared. */
19446
19447 void
19448 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19449 rtx *ops)
19450 {
19451 machine_mode pred_mode
19452 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
19453 GET_MODE_SIZE (cmp_mode)).require ();
19454 rtx pred = gen_reg_rtx (pred_mode);
19455 if (FLOAT_MODE_P (cmp_mode))
19456 {
19457 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19458 ops[4], ops[5], true))
19459 std::swap (ops[1], ops[2]);
19460 }
19461 else
19462 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19463
19464 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19465 ops[1] = force_reg (data_mode, ops[1]);
19466 /* The "false" value can only be zero if the "true" value is a constant. */
19467 if (register_operand (ops[1], data_mode)
19468 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19469 ops[2] = force_reg (data_mode, ops[2]);
19470
19471 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
19472 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
19473 }
19474
19475 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19476 true. However, due to issues with register allocation it is preferable
19477 to avoid tying integer scalar and FP scalar modes. Executing integer
19478 operations in general registers is better than treating them as scalar
19479 vector operations. This reduces latency and avoids redundant int<->FP
19480 moves. So tie modes if they are either the same class, or vector modes
19481 with other vector modes, vector structs or any scalar mode. */
19482
19483 static bool
19484 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
19485 {
19486 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
19487 return true;
19488
19489 /* We specifically want to allow elements of "structure" modes to
19490 be tieable to the structure. This more general condition allows
19491 other rarer situations too. The reason we don't extend this to
19492 predicate modes is that there are no predicate structure modes
19493 nor any specific instructions for extracting part of a predicate
19494 register. */
19495 if (aarch64_vector_data_mode_p (mode1)
19496 && aarch64_vector_data_mode_p (mode2))
19497 return true;
19498
19499 /* Also allow any scalar modes with vectors. */
19500 if (aarch64_vector_mode_supported_p (mode1)
19501 || aarch64_vector_mode_supported_p (mode2))
19502 return true;
19503
19504 return false;
19505 }
19506
19507 /* Return a new RTX holding the result of moving POINTER forward by
19508 AMOUNT bytes. */
19509
19510 static rtx
19511 aarch64_move_pointer (rtx pointer, poly_int64 amount)
19512 {
19513 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
19514
19515 return adjust_automodify_address (pointer, GET_MODE (pointer),
19516 next, amount);
19517 }
19518
19519 /* Return a new RTX holding the result of moving POINTER forward by the
19520 size of the mode it points to. */
19521
19522 static rtx
19523 aarch64_progress_pointer (rtx pointer)
19524 {
19525 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
19526 }
19527
19528 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
19529 MODE bytes. */
19530
19531 static void
19532 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
19533 machine_mode mode)
19534 {
19535 rtx reg = gen_reg_rtx (mode);
19536
19537 /* "Cast" the pointers to the correct mode. */
19538 *src = adjust_address (*src, mode, 0);
19539 *dst = adjust_address (*dst, mode, 0);
19540 /* Emit the memcpy. */
19541 emit_move_insn (reg, *src);
19542 emit_move_insn (*dst, reg);
19543 /* Move the pointers forward. */
19544 *src = aarch64_progress_pointer (*src);
19545 *dst = aarch64_progress_pointer (*dst);
19546 }
19547
19548 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
19549 we succeed, otherwise return false. */
19550
19551 bool
19552 aarch64_expand_cpymem (rtx *operands)
19553 {
19554 int n, mode_bits;
19555 rtx dst = operands[0];
19556 rtx src = operands[1];
19557 rtx base;
19558 machine_mode cur_mode = BLKmode, next_mode;
19559 bool speed_p = !optimize_function_for_size_p (cfun);
19560
19561 /* When optimizing for size, give a better estimate of the length of a
19562 memcpy call, but use the default otherwise. Moves larger than 8 bytes
19563 will always require an even number of instructions, and each
19564 operation requires both a load and a store, so divide the max number by 2. */
19565 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
19566
19567 /* We can't do anything smart if the amount to copy is not constant. */
19568 if (!CONST_INT_P (operands[2]))
19569 return false;
19570
19571 n = INTVAL (operands[2]);
19572
19573 /* Try to keep the number of instructions low. For all cases we will do at
19574 most two moves for the residual amount, since we'll always overlap the
19575 remainder. */
19576 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
19577 return false;
19578
19579 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
19580 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
19581
19582 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
19583 src = adjust_automodify_address (src, VOIDmode, base, 0);
19584
19585 /* Convert n to bits to make the rest of the code simpler. */
19586 n = n * BITS_PER_UNIT;
19587
19588 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
19589 larger than TImode, but we should not use them for loads/stores here. */
19590 const int copy_limit = GET_MODE_BITSIZE (TImode);
19591
19592 while (n > 0)
19593 {
19594 /* Find the largest mode in which to do the copy without over-reading
19595 or over-writing. */
19596 opt_scalar_int_mode mode_iter;
19597 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
19598 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
19599 cur_mode = mode_iter.require ();
19600
19601 gcc_assert (cur_mode != BLKmode);
19602
19603 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
19604 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
19605
19606 n -= mode_bits;
19607
19608 /* Do certain trailing copies as overlapping if it's going to be
19609 cheaper, i.e. fewer instructions. For instance, for a 15-byte
19610 copy it is more efficient to do two overlapping 8-byte copies than
19611 8 + 6 + 1. */
19612 if (n > 0 && n <= 8 * BITS_PER_UNIT)
19613 {
19614 next_mode = smallest_mode_for_size (n, MODE_INT);
19615 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
19616 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
19617 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
19618 n = n_bits;
19619 }
19620 }
19621
19622 return true;
19623 }
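/* Editor's note: a standalone model (not GCC code) of the chunk selection
   above, tracking only byte counts; the mode sizes and the TImode copy limit
   are hard-coded assumptions here.  model_cpymem (15) prints an 8-byte copy
   at offset 0 followed by an overlapping 8-byte copy at offset 7, matching
   the comment above about 15-byte copies.  */
#if 0
#include <stdio.h>

static void
model_cpymem (int n)
{
  static const int sizes[] = { 16, 8, 4, 2, 1 };   /* TI, DI, SI, HI, QI.  */
  int offset = 0;
  while (n > 0)
    {
      /* Pick the largest chunk that does not over-read or over-write.  */
      int chunk = 1;
      for (unsigned int i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
	if (sizes[i] <= n)
	  {
	    chunk = sizes[i];
	    break;
	  }
      printf ("copy %d bytes at offset %d\n", chunk, offset);
      offset += chunk;
      n -= chunk;
      /* Overlap a short tail with the previous chunk, as done above for
	 residues of at most 8 bytes.  */
      if (n > 0 && n <= 8)
	{
	  int tail = 1;
	  while (tail < n)
	    tail *= 2;
	  offset -= tail - n;
	  n = tail;
	}
    }
}
#endif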
19624
19625 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
19626 SImode stores. Handle the case when the constant has identical
19627 bottom and top halves. This is beneficial when the two stores can be
19628 merged into an STP and we avoid synthesising potentially expensive
19629 immediates twice. Return true if such a split is possible. */
19630
19631 bool
19632 aarch64_split_dimode_const_store (rtx dst, rtx src)
19633 {
19634 rtx lo = gen_lowpart (SImode, src);
19635 rtx hi = gen_highpart_mode (SImode, DImode, src);
19636
19637 bool size_p = optimize_function_for_size_p (cfun);
19638
19639 if (!rtx_equal_p (lo, hi))
19640 return false;
19641
19642 unsigned int orig_cost
19643 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
19644 unsigned int lo_cost
19645 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
19646
19647 /* We want to transform:
19648 MOV x1, 49370
19649 MOVK x1, 0x140, lsl 16
19650 MOVK x1, 0xc0da, lsl 32
19651 MOVK x1, 0x140, lsl 48
19652 STR x1, [x0]
19653 into:
19654 MOV w1, 49370
19655 MOVK w1, 0x140, lsl 16
19656 STP w1, w1, [x0]
19657 So we want to perform this only when we save two instructions
19658 or more. When optimizing for size, however, accept any code size
19659 savings we can. */
19660 if (size_p && orig_cost <= lo_cost)
19661 return false;
19662
19663 if (!size_p
19664 && (orig_cost <= lo_cost + 1))
19665 return false;
19666
19667 rtx mem_lo = adjust_address (dst, SImode, 0);
19668 if (!aarch64_mem_pair_operand (mem_lo, SImode))
19669 return false;
19670
19671 rtx tmp_reg = gen_reg_rtx (SImode);
19672 aarch64_expand_mov_immediate (tmp_reg, lo);
19673 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
19674 /* Don't emit an explicit store pair as this may not be always profitable.
19675 Let the sched-fusion logic decide whether to merge them. */
19676 emit_move_insn (mem_lo, tmp_reg);
19677 emit_move_insn (mem_hi, tmp_reg);
19678
19679 return true;
19680 }
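/* Editor's note: a standalone sketch (not GCC code) of the precondition used
   above, phrased on a plain 64-bit value: the split is only attempted when
   the two 32-bit halves are identical, as in the 0x0140c0da0140c0da constant
   built by the MOV/MOVK sequence in the comment above.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
halves_identical (uint64_t x)
{
  return (uint32_t) x == (uint32_t) (x >> 32);
}
#endif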
19681
19682 /* Generate RTL for a conditional branch with rtx comparison CODE in
19683 mode CC_MODE. The destination of the unlikely conditional branch
19684 is LABEL_REF. */
19685
19686 void
19687 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
19688 rtx label_ref)
19689 {
19690 rtx x;
19691 x = gen_rtx_fmt_ee (code, VOIDmode,
19692 gen_rtx_REG (cc_mode, CC_REGNUM),
19693 const0_rtx);
19694
19695 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19696 gen_rtx_LABEL_REF (VOIDmode, label_ref),
19697 pc_rtx);
19698 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19699 }
19700
19701 /* Generate DImode scratch registers for 128-bit (TImode) addition.
19702
19703 OP1 represents the TImode destination operand 1
19704 OP2 represents the TImode destination operand 2
19705 LOW_DEST represents the low half (DImode) of TImode operand 0
19706 LOW_IN1 represents the low half (DImode) of TImode operand 1
19707 LOW_IN2 represents the low half (DImode) of TImode operand 2
19708 HIGH_DEST represents the high half (DImode) of TImode operand 0
19709 HIGH_IN1 represents the high half (DImode) of TImode operand 1
19710 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
19711
19712 void
19713 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
19714 rtx *low_in1, rtx *low_in2,
19715 rtx *high_dest, rtx *high_in1,
19716 rtx *high_in2)
19717 {
19718 *low_dest = gen_reg_rtx (DImode);
19719 *low_in1 = gen_lowpart (DImode, op1);
19720 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
19721 subreg_lowpart_offset (DImode, TImode));
19722 *high_dest = gen_reg_rtx (DImode);
19723 *high_in1 = gen_highpart (DImode, op1);
19724 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
19725 subreg_highpart_offset (DImode, TImode));
19726 }
19727
19728 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
19729
19730 This function differs from 'aarch64_addti_scratch_regs' in that
19731 OP1 can be an immediate constant (zero). We must call
19732 subreg_highpart_offset with DImode and TImode arguments, otherwise
19733 VOIDmode will be used for the const_int which generates an internal
19734 error from subreg_size_highpart_offset which does not expect a size of zero.
19735
19736 OP1 represents the TImode destination operand 1
19737 OP2 represents the TImode destination operand 2
19738 LOW_DEST represents the low half (DImode) of TImode operand 0
19739 LOW_IN1 represents the low half (DImode) of TImode operand 1
19740 LOW_IN2 represents the low half (DImode) of TImode operand 2
19741 HIGH_DEST represents the high half (DImode) of TImode operand 0
19742 HIGH_IN1 represents the high half (DImode) of TImode operand 1
19743 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
19744
19745
19746 void
19747 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
19748 rtx *low_in1, rtx *low_in2,
19749 rtx *high_dest, rtx *high_in1,
19750 rtx *high_in2)
19751 {
19752 *low_dest = gen_reg_rtx (DImode);
19753 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
19754 subreg_lowpart_offset (DImode, TImode));
19755
19756 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
19757 subreg_lowpart_offset (DImode, TImode));
19758 *high_dest = gen_reg_rtx (DImode);
19759
19760 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
19761 subreg_highpart_offset (DImode, TImode));
19762 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
19763 subreg_highpart_offset (DImode, TImode));
19764 }
19765
19766 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
19767
19768 OP0 represents the TImode destination operand 0
19769 LOW_DEST represents the low half (DImode) of TImode operand 0
19770 LOW_IN1 represents the low half (DImode) of TImode operand 1
19771 LOW_IN2 represents the low half (DImode) of TImode operand 2
19772 HIGH_DEST represents the high half (DImode) of TImode operand 0
19773 HIGH_IN1 represents the high half (DImode) of TImode operand 1
19774 HIGH_IN2 represents the high half (DImode) of TImode operand 2
19775 UNSIGNED_P is true if the operation is being performed on unsigned
19776 values. */
19777 void
19778 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
19779 rtx low_in2, rtx high_dest, rtx high_in1,
19780 rtx high_in2, bool unsigned_p)
19781 {
19782 if (low_in2 == const0_rtx)
19783 {
19784 low_dest = low_in1;
19785 high_in2 = force_reg (DImode, high_in2);
19786 if (unsigned_p)
19787 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
19788 else
19789 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
19790 }
19791 else
19792 {
19793 if (CONST_INT_P (low_in2))
19794 {
19795 high_in2 = force_reg (DImode, high_in2);
19796 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
19797 GEN_INT (-INTVAL (low_in2))));
19798 }
19799 else
19800 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
19801
19802 if (unsigned_p)
19803 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
19804 else
19805 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
19806 }
19807
19808 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
19809 emit_move_insn (gen_highpart (DImode, op0), high_dest);
19810
19811 }
19812
19813 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
19814
19815 static unsigned HOST_WIDE_INT
19816 aarch64_asan_shadow_offset (void)
19817 {
19818 if (TARGET_ILP32)
19819 return (HOST_WIDE_INT_1 << 29);
19820 else
19821 return (HOST_WIDE_INT_1 << 36);
19822 }
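/* Editor's note: a standalone sketch (not GCC code) of how ASan uses the
   offset returned above with its usual scale of 3 (one shadow byte per eight
   application bytes).  The exact mapping belongs to libsanitizer, so treat
   this as an illustration only.  */
#if 0
#include <stdint.h>

static uintptr_t
asan_shadow_address (uintptr_t addr, uintptr_t shadow_offset)
{
  /* shadow_offset is 1 << 36 for LP64 and 1 << 29 for ILP32 above.  */
  return (addr >> 3) + shadow_offset;
}
#endif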
19823
19824 static rtx
19825 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
19826 int code, tree treeop0, tree treeop1)
19827 {
19828 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19829 rtx op0, op1;
19830 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19831 insn_code icode;
19832 struct expand_operand ops[4];
19833
19834 start_sequence ();
19835 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19836
19837 op_mode = GET_MODE (op0);
19838 if (op_mode == VOIDmode)
19839 op_mode = GET_MODE (op1);
19840
19841 switch (op_mode)
19842 {
19843 case E_QImode:
19844 case E_HImode:
19845 case E_SImode:
19846 cmp_mode = SImode;
19847 icode = CODE_FOR_cmpsi;
19848 break;
19849
19850 case E_DImode:
19851 cmp_mode = DImode;
19852 icode = CODE_FOR_cmpdi;
19853 break;
19854
19855 case E_SFmode:
19856 cmp_mode = SFmode;
19857 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
19858 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
19859 break;
19860
19861 case E_DFmode:
19862 cmp_mode = DFmode;
19863 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
19864 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
19865 break;
19866
19867 default:
19868 end_sequence ();
19869 return NULL_RTX;
19870 }
19871
19872 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
19873 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
19874 if (!op0 || !op1)
19875 {
19876 end_sequence ();
19877 return NULL_RTX;
19878 }
19879 *prep_seq = get_insns ();
19880 end_sequence ();
19881
19882 create_fixed_operand (&ops[0], op0);
19883 create_fixed_operand (&ops[1], op1);
19884
19885 start_sequence ();
19886 if (!maybe_expand_insn (icode, 2, ops))
19887 {
19888 end_sequence ();
19889 return NULL_RTX;
19890 }
19891 *gen_seq = get_insns ();
19892 end_sequence ();
19893
19894 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
19895 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
19896 }
19897
19898 static rtx
19899 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
19900 int cmp_code, tree treeop0, tree treeop1, int bit_code)
19901 {
19902 rtx op0, op1, target;
19903 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19904 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19905 insn_code icode;
19906 struct expand_operand ops[6];
19907 int aarch64_cond;
19908
19909 push_to_sequence (*prep_seq);
19910 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19911
19912 op_mode = GET_MODE (op0);
19913 if (op_mode == VOIDmode)
19914 op_mode = GET_MODE (op1);
19915
19916 switch (op_mode)
19917 {
19918 case E_QImode:
19919 case E_HImode:
19920 case E_SImode:
19921 cmp_mode = SImode;
19922 icode = CODE_FOR_ccmpsi;
19923 break;
19924
19925 case E_DImode:
19926 cmp_mode = DImode;
19927 icode = CODE_FOR_ccmpdi;
19928 break;
19929
19930 case E_SFmode:
19931 cmp_mode = SFmode;
19932 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19933 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
19934 break;
19935
19936 case E_DFmode:
19937 cmp_mode = DFmode;
19938 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19939 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
19940 break;
19941
19942 default:
19943 end_sequence ();
19944 return NULL_RTX;
19945 }
19946
19947 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
19948 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
19949 if (!op0 || !op1)
19950 {
19951 end_sequence ();
19952 return NULL_RTX;
19953 }
19954 *prep_seq = get_insns ();
19955 end_sequence ();
19956
19957 target = gen_rtx_REG (cc_mode, CC_REGNUM);
19958 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
19959
19960 if (bit_code != AND)
19961 {
19962 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
19963 GET_MODE (XEXP (prev, 0))),
19964 VOIDmode, XEXP (prev, 0), const0_rtx);
19965 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
19966 }
19967
19968 create_fixed_operand (&ops[0], XEXP (prev, 0));
19969 create_fixed_operand (&ops[1], target);
19970 create_fixed_operand (&ops[2], op0);
19971 create_fixed_operand (&ops[3], op1);
19972 create_fixed_operand (&ops[4], prev);
19973 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
19974
19975 push_to_sequence (*gen_seq);
19976 if (!maybe_expand_insn (icode, 6, ops))
19977 {
19978 end_sequence ();
19979 return NULL_RTX;
19980 }
19981
19982 *gen_seq = get_insns ();
19983 end_sequence ();
19984
19985 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
19986 }
19987
19988 #undef TARGET_GEN_CCMP_FIRST
19989 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19990
19991 #undef TARGET_GEN_CCMP_NEXT
19992 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19993
19994 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19995 instruction fusion of some sort. */
19996
19997 static bool
19998 aarch64_macro_fusion_p (void)
19999 {
20000 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20001 }
20002
20003
20004 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20005 should be kept together during scheduling. */
20006
20007 static bool
20008 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20009 {
20010 rtx set_dest;
20011 rtx prev_set = single_set (prev);
20012 rtx curr_set = single_set (curr);
20013 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
20014 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20015
20016 if (!aarch64_macro_fusion_p ())
20017 return false;
20018
20019 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20020 {
20021 /* We are trying to match:
20022 prev (mov) == (set (reg r0) (const_int imm16))
20023 curr (movk) == (set (zero_extract (reg r0)
20024 (const_int 16)
20025 (const_int 16))
20026 (const_int imm16_1)) */
20027
20028 set_dest = SET_DEST (curr_set);
20029
20030 if (GET_CODE (set_dest) == ZERO_EXTRACT
20031 && CONST_INT_P (SET_SRC (curr_set))
20032 && CONST_INT_P (SET_SRC (prev_set))
20033 && CONST_INT_P (XEXP (set_dest, 2))
20034 && INTVAL (XEXP (set_dest, 2)) == 16
20035 && REG_P (XEXP (set_dest, 0))
20036 && REG_P (SET_DEST (prev_set))
20037 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20038 {
20039 return true;
20040 }
20041 }
20042
20043 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20044 {
20045
20046 /* We're trying to match:
20047 prev (adrp) == (set (reg r1)
20048 (high (symbol_ref ("SYM"))))
20049 curr (add) == (set (reg r0)
20050 (lo_sum (reg r1)
20051 (symbol_ref ("SYM"))))
20052 Note that r0 need not necessarily be the same as r1, especially
20053 during pre-regalloc scheduling. */
20054
20055 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20056 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20057 {
20058 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20059 && REG_P (XEXP (SET_SRC (curr_set), 0))
20060 && REGNO (XEXP (SET_SRC (curr_set), 0))
20061 == REGNO (SET_DEST (prev_set))
20062 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20063 XEXP (SET_SRC (curr_set), 1)))
20064 return true;
20065 }
20066 }
20067
20068 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20069 {
20070
20071 /* We're trying to match:
20072 prev (movk) == (set (zero_extract (reg r0)
20073 (const_int 16)
20074 (const_int 32))
20075 (const_int imm16_1))
20076 curr (movk) == (set (zero_extract (reg r0)
20077 (const_int 16)
20078 (const_int 48))
20079 (const_int imm16_2)) */
20080
20081 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20082 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20083 && REG_P (XEXP (SET_DEST (prev_set), 0))
20084 && REG_P (XEXP (SET_DEST (curr_set), 0))
20085 && REGNO (XEXP (SET_DEST (prev_set), 0))
20086 == REGNO (XEXP (SET_DEST (curr_set), 0))
20087 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20088 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20089 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20090 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20091 && CONST_INT_P (SET_SRC (prev_set))
20092 && CONST_INT_P (SET_SRC (curr_set)))
20093 return true;
20094
20095 }
20096 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20097 {
20098 /* We're trying to match:
20099 prev (adrp) == (set (reg r0)
20100 (high (symbol_ref ("SYM"))))
20101 curr (ldr) == (set (reg r1)
20102 (mem (lo_sum (reg r0)
20103 (symbol_ref ("SYM")))))
20104 or
20105 curr (ldr) == (set (reg r1)
20106 (zero_extend (mem
20107 (lo_sum (reg r0)
20108 (symbol_ref ("SYM")))))) */
20109 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20110 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20111 {
20112 rtx curr_src = SET_SRC (curr_set);
20113
20114 if (GET_CODE (curr_src) == ZERO_EXTEND)
20115 curr_src = XEXP (curr_src, 0);
20116
20117 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20118 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20119 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20120 == REGNO (SET_DEST (prev_set))
20121 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20122 XEXP (SET_SRC (prev_set), 0)))
20123 return true;
20124 }
20125 }
20126
20127 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20128 && any_condjump_p (curr))
20129 {
20130 unsigned int condreg1, condreg2;
20131 rtx cc_reg_1;
20132 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20133 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20134
20135 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20136 && prev
20137 && modified_in_p (cc_reg_1, prev))
20138 {
20139 enum attr_type prev_type = get_attr_type (prev);
20140
20141 /* FIXME: this misses some instructions that ThunderX considers
20142 simple arithmetic; simple shifts, for example, are missed here. */
20143 if (prev_type == TYPE_ALUS_SREG
20144 || prev_type == TYPE_ALUS_IMM
20145 || prev_type == TYPE_LOGICS_REG
20146 || prev_type == TYPE_LOGICS_IMM)
20147 return true;
20148 }
20149 }
20150
20151 if (prev_set
20152 && curr_set
20153 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20154 && any_condjump_p (curr))
20155 {
20156 /* We're trying to match:
20157 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20158 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20159 (const_int 0))
20160 (label_ref ("SYM"))
20161 (pc)) */
20162 if (SET_DEST (curr_set) == (pc_rtx)
20163 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20164 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20165 && REG_P (SET_DEST (prev_set))
20166 && REGNO (SET_DEST (prev_set))
20167 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20168 {
20169 /* Fuse ALU operations followed by conditional branch instruction. */
20170 switch (get_attr_type (prev))
20171 {
20172 case TYPE_ALU_IMM:
20173 case TYPE_ALU_SREG:
20174 case TYPE_ADC_REG:
20175 case TYPE_ADC_IMM:
20176 case TYPE_ADCS_REG:
20177 case TYPE_ADCS_IMM:
20178 case TYPE_LOGIC_REG:
20179 case TYPE_LOGIC_IMM:
20180 case TYPE_CSEL:
20181 case TYPE_ADR:
20182 case TYPE_MOV_IMM:
20183 case TYPE_SHIFT_REG:
20184 case TYPE_SHIFT_IMM:
20185 case TYPE_BFM:
20186 case TYPE_RBIT:
20187 case TYPE_REV:
20188 case TYPE_EXTEND:
20189 return true;
20190
20191 default:;
20192 }
20193 }
20194 }
20195
20196 return false;
20197 }
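/* Editor's note: illustrative AArch64 assembly (not taken from this file) of
   the kinds of pairs the checks above try to keep adjacent:

     mov   x0, #0xc0da
     movk  x0, #0x140, lsl #16        MOV followed by MOVK

     adrp  x1, sym
     add   x0, x1, :lo12:sym          ADRP followed by ADD

     adrp  x1, sym
     ldr   x0, [x1, :lo12:sym]        ADRP followed by LDR  */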
20198
20199 /* Return true iff the instruction fusion described by OP is enabled. */
20200
20201 bool
20202 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20203 {
20204 return (aarch64_tune_params.fusible_ops & op) != 0;
20205 }
20206
20207 /* If MEM is in the form of [base+offset], extract the two parts
20208 of the address into BASE and OFFSET; otherwise return false
20209 after clearing BASE and OFFSET. */
20210
20211 bool
20212 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20213 {
20214 rtx addr;
20215
20216 gcc_assert (MEM_P (mem));
20217
20218 addr = XEXP (mem, 0);
20219
20220 if (REG_P (addr))
20221 {
20222 *base = addr;
20223 *offset = const0_rtx;
20224 return true;
20225 }
20226
20227 if (GET_CODE (addr) == PLUS
20228 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20229 {
20230 *base = XEXP (addr, 0);
20231 *offset = XEXP (addr, 1);
20232 return true;
20233 }
20234
20235 *base = NULL_RTX;
20236 *offset = NULL_RTX;
20237
20238 return false;
20239 }
20240
20241 /* Types for scheduling fusion. */
20242 enum sched_fusion_type
20243 {
20244 SCHED_FUSION_NONE = 0,
20245 SCHED_FUSION_LD_SIGN_EXTEND,
20246 SCHED_FUSION_LD_ZERO_EXTEND,
20247 SCHED_FUSION_LD,
20248 SCHED_FUSION_ST,
20249 SCHED_FUSION_NUM
20250 };
20251
20252 /* If INSN is a load or store with an address in the form of [base+offset],
20253 extract the two parts into BASE and OFFSET. Return the scheduling
20254 fusion type of this INSN. */
20255
20256 static enum sched_fusion_type
20257 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20258 {
20259 rtx x, dest, src;
20260 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20261
20262 gcc_assert (INSN_P (insn));
20263 x = PATTERN (insn);
20264 if (GET_CODE (x) != SET)
20265 return SCHED_FUSION_NONE;
20266
20267 src = SET_SRC (x);
20268 dest = SET_DEST (x);
20269
20270 machine_mode dest_mode = GET_MODE (dest);
20271
20272 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20273 return SCHED_FUSION_NONE;
20274
20275 if (GET_CODE (src) == SIGN_EXTEND)
20276 {
20277 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20278 src = XEXP (src, 0);
20279 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20280 return SCHED_FUSION_NONE;
20281 }
20282 else if (GET_CODE (src) == ZERO_EXTEND)
20283 {
20284 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20285 src = XEXP (src, 0);
20286 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20287 return SCHED_FUSION_NONE;
20288 }
20289
20290 if (GET_CODE (src) == MEM && REG_P (dest))
20291 extract_base_offset_in_addr (src, base, offset);
20292 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20293 {
20294 fusion = SCHED_FUSION_ST;
20295 extract_base_offset_in_addr (dest, base, offset);
20296 }
20297 else
20298 return SCHED_FUSION_NONE;
20299
20300 if (*base == NULL_RTX || *offset == NULL_RTX)
20301 fusion = SCHED_FUSION_NONE;
20302
20303 return fusion;
20304 }
20305
20306 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20307
20308 Currently we only support fusing ldr and str instructions, so FUSION_PRI
20309 and PRI are only calculated for these instructions. For other instructions,
20310 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20311 types of instruction fusion can be added by returning different priorities.
20312
20313 It's important that irrelevant instructions get the largest FUSION_PRI. */
20314
20315 static void
20316 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20317 int *fusion_pri, int *pri)
20318 {
20319 int tmp, off_val;
20320 rtx base, offset;
20321 enum sched_fusion_type fusion;
20322
20323 gcc_assert (INSN_P (insn));
20324
20325 tmp = max_pri - 1;
20326 fusion = fusion_load_store (insn, &base, &offset);
20327 if (fusion == SCHED_FUSION_NONE)
20328 {
20329 *pri = tmp;
20330 *fusion_pri = tmp;
20331 return;
20332 }
20333
20334 /* Set FUSION_PRI according to fusion type and base register. */
20335 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20336
20337 /* Calculate PRI. */
20338 tmp /= 2;
20339
20340 /* INSN with smaller offset goes first. */
20341 off_val = (int)(INTVAL (offset));
20342 if (off_val >= 0)
20343 tmp -= (off_val & 0xfffff);
20344 else
20345 tmp += ((- off_val) & 0xfffff);
20346
20347 *pri = tmp;
20348 return;
20349 }
20350
20351 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20352 Adjust priority of sha1h instructions so they are scheduled before
20353 other SHA1 instructions. */
20354
20355 static int
20356 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20357 {
20358 rtx x = PATTERN (insn);
20359
20360 if (GET_CODE (x) == SET)
20361 {
20362 x = SET_SRC (x);
20363
20364 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20365 return priority + 10;
20366 }
20367
20368 return priority;
20369 }
20370
20371 /* Given OPERANDS of consecutive load/store, check if we can merge
20372 them into ldp/stp. LOAD is true if they are load instructions.
20373 MODE is the mode of memory operands. */
20374
20375 bool
20376 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20377 machine_mode mode)
20378 {
20379 HOST_WIDE_INT offval_1, offval_2, msize;
20380 enum reg_class rclass_1, rclass_2;
20381 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20382
20383 if (load)
20384 {
20385 mem_1 = operands[1];
20386 mem_2 = operands[3];
20387 reg_1 = operands[0];
20388 reg_2 = operands[2];
20389 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20390 if (REGNO (reg_1) == REGNO (reg_2))
20391 return false;
20392 }
20393 else
20394 {
20395 mem_1 = operands[0];
20396 mem_2 = operands[2];
20397 reg_1 = operands[1];
20398 reg_2 = operands[3];
20399 }
20400
20401 /* The mems cannot be volatile. */
20402 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20403 return false;
20404
20405 /* If we have SImode and slow unaligned ldp,
20406 check that the alignment is at least 8 bytes. */
20407 if (mode == SImode
20408 && (aarch64_tune_params.extra_tuning_flags
20409 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20410 && !optimize_size
20411 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20412 return false;
20413
20414 /* Check if the addresses are in the form of [base+offset]. */
20415 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20416 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20417 return false;
20418 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20419 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20420 return false;
20421
20422 /* Check if the bases are same. */
20423 if (!rtx_equal_p (base_1, base_2))
20424 return false;
20425
20426 /* The operands must be of the same size. */
20427 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20428 GET_MODE_SIZE (GET_MODE (mem_2))));
20429
20430 offval_1 = INTVAL (offset_1);
20431 offval_2 = INTVAL (offset_2);
20432 /* We should only be trying this for fixed-sized modes. There is no
20433 SVE LDP/STP instruction. */
20434 msize = GET_MODE_SIZE (mode).to_constant ();
20435 /* Check if the offsets are consecutive. */
20436 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20437 return false;
20438
20439 /* Check if the addresses are clobbered by load. */
20440 if (load)
20441 {
20442 if (reg_mentioned_p (reg_1, mem_1))
20443 return false;
20444
20445 /* In increasing order, the last load can clobber the address. */
20446 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20447 return false;
20448 }
20449
20450 /* One of the memory accesses must be a mempair operand.
20451 If it is not the first one, they need to be swapped by the
20452 peephole. */
20453 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
20454 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
20455 return false;
20456
20457 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
20458 rclass_1 = FP_REGS;
20459 else
20460 rclass_1 = GENERAL_REGS;
20461
20462 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
20463 rclass_2 = FP_REGS;
20464 else
20465 rclass_2 = GENERAL_REGS;
20466
20467 /* Check if the registers are of same class. */
20468 if (rclass_1 != rclass_2)
20469 return false;
20470
20471 return true;
20472 }
20473
20474 /* Given OPERANDS of consecutive load/store that can be merged,
20475 swap them if they are not in ascending order. */
20476 void
20477 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
20478 {
20479 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
20480 HOST_WIDE_INT offval_1, offval_2;
20481
20482 if (load)
20483 {
20484 mem_1 = operands[1];
20485 mem_2 = operands[3];
20486 }
20487 else
20488 {
20489 mem_1 = operands[0];
20490 mem_2 = operands[2];
20491 }
20492
20493 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20494 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20495
20496 offval_1 = INTVAL (offset_1);
20497 offval_2 = INTVAL (offset_2);
20498
20499 if (offval_1 > offval_2)
20500 {
20501 /* Irrespective of whether this is a load or a store,
20502 we do the same swap. */
20503 std::swap (operands[0], operands[2]);
20504 std::swap (operands[1], operands[3]);
20505 }
20506 }
20507
20508 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20509 comparison between the two. */
20510 int
20511 aarch64_host_wide_int_compare (const void *x, const void *y)
20512 {
20513 return wi::cmps (* ((const HOST_WIDE_INT *) x),
20514 * ((const HOST_WIDE_INT *) y));
20515 }
20516
20517 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20518 other pointing to a REG rtx containing an offset, compare the offsets
20519 of the two pairs.
20520
20521 Return:
20522
20523 1 iff offset (X) > offset (Y)
20524 0 iff offset (X) == offset (Y)
20525 -1 iff offset (X) < offset (Y) */
20526 int
20527 aarch64_ldrstr_offset_compare (const void *x, const void *y)
20528 {
20529 const rtx * operands_1 = (const rtx *) x;
20530 const rtx * operands_2 = (const rtx *) y;
20531 rtx mem_1, mem_2, base, offset_1, offset_2;
20532
20533 if (MEM_P (operands_1[0]))
20534 mem_1 = operands_1[0];
20535 else
20536 mem_1 = operands_1[1];
20537
20538 if (MEM_P (operands_2[0]))
20539 mem_2 = operands_2[0];
20540 else
20541 mem_2 = operands_2[1];
20542
20543 /* Extract the offsets. */
20544 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20545 extract_base_offset_in_addr (mem_2, &base, &offset_2);
20546
20547 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
20548
20549 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
20550 }
20551
20552 /* Given OPERANDS of consecutive load/store, check if we can merge
20553 them into ldp/stp by adjusting the offset. LOAD is true if they
20554 are load instructions. MODE is the mode of memory operands.
20555
20556 Given below consecutive stores:
20557
20558 str w1, [xb, 0x100]
20559 str w1, [xb, 0x104]
20560 str w1, [xb, 0x108]
20561 str w1, [xb, 0x10c]
20562
20563 Though the offsets are out of the range supported by stp, we can
20564 still pair them after adjusting the offset, like:
20565
20566 add scratch, xb, 0x100
20567 stp w1, w1, [scratch]
20568 stp w1, w1, [scratch, 0x8]
20569
20570 The peephole patterns detecting this opportunity should guarantee
20571 the scratch register is available. */
20572
20573 bool
20574 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
20575 scalar_mode mode)
20576 {
20577 const int num_insns = 4;
20578 enum reg_class rclass;
20579 HOST_WIDE_INT offvals[num_insns], msize;
20580 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
20581
20582 if (load)
20583 {
20584 for (int i = 0; i < num_insns; i++)
20585 {
20586 reg[i] = operands[2 * i];
20587 mem[i] = operands[2 * i + 1];
20588
20589 gcc_assert (REG_P (reg[i]));
20590 }
20591
20592 /* Do not attempt to merge the loads if the loads clobber each other. */
20593 for (int i = 0; i < 8; i += 2)
20594 for (int j = i + 2; j < 8; j += 2)
20595 if (reg_overlap_mentioned_p (operands[i], operands[j]))
20596 return false;
20597 }
20598 else
20599 for (int i = 0; i < num_insns; i++)
20600 {
20601 mem[i] = operands[2 * i];
20602 reg[i] = operands[2 * i + 1];
20603 }
20604
20605 /* Skip if memory operand is by itself valid for ldp/stp. */
20606 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
20607 return false;
20608
20609 for (int i = 0; i < num_insns; i++)
20610 {
20611 /* The mems cannot be volatile. */
20612 if (MEM_VOLATILE_P (mem[i]))
20613 return false;
20614
20615 /* Check if the addresses are in the form of [base+offset]. */
20616 extract_base_offset_in_addr (mem[i], base + i, offset + i);
20617 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
20618 return false;
20619 }
20620
20621 /* Check if the registers are of same class. */
20622 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
20623 ? FP_REGS : GENERAL_REGS;
20624
20625 for (int i = 1; i < num_insns; i++)
20626 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
20627 {
20628 if (rclass != FP_REGS)
20629 return false;
20630 }
20631 else
20632 {
20633 if (rclass != GENERAL_REGS)
20634 return false;
20635 }
20636
20637 /* Only the last register in the order in which they occur
20638 may be clobbered by the load. */
20639 if (rclass == GENERAL_REGS && load)
20640 for (int i = 0; i < num_insns - 1; i++)
20641 if (reg_mentioned_p (reg[i], mem[i]))
20642 return false;
20643
20644 /* Check if the bases are same. */
20645 for (int i = 0; i < num_insns - 1; i++)
20646 if (!rtx_equal_p (base[i], base[i + 1]))
20647 return false;
20648
20649 for (int i = 0; i < num_insns; i++)
20650 offvals[i] = INTVAL (offset[i]);
20651
20652 msize = GET_MODE_SIZE (mode);
20653
20654 /* Check if the offsets can be put in the right order to do a ldp/stp. */
20655 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
20656 aarch64_host_wide_int_compare);
20657
20658 if (!(offvals[1] == offvals[0] + msize
20659 && offvals[3] == offvals[2] + msize))
20660 return false;
20661
20662 /* Check that offsets are within range of each other. The ldp/stp
20663 instructions have 7 bit immediate offsets, so use 0x80. */
20664 if (offvals[2] - offvals[0] >= msize * 0x80)
20665 return false;
20666
20667 /* The offsets must be aligned with respect to each other. */
20668 if (offvals[0] % msize != offvals[2] % msize)
20669 return false;
20670
20671 /* If we have SImode and slow unaligned ldp,
20672 check that the alignment is at least 8 bytes. */
20673 if (mode == SImode
20674 && (aarch64_tune_params.extra_tuning_flags
20675 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20676 && !optimize_size
20677 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
20678 return false;
20679
20680 return true;
20681 }
20682
20683 /* Given OPERANDS of consecutive load/store, this function pairs them
20684 into LDP/STP after adjusting the offset. It depends on the fact
20685 that the operands can be sorted so the offsets are correct for STP.
20686 MODE is the mode of memory operands. CODE is the rtl operator
20687 which should be applied to all memory operands: SIGN_EXTEND,
20688 ZERO_EXTEND or UNKNOWN. */
20689
20690 bool
20691 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
20692 scalar_mode mode, RTX_CODE code)
20693 {
20694 rtx base, offset_1, offset_3, t1, t2;
20695 rtx mem_1, mem_2, mem_3, mem_4;
20696 rtx temp_operands[8];
20697 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
20698 stp_off_upper_limit, stp_off_lower_limit, msize;
20699
20700 /* We make changes on a copy as we may still bail out. */
20701 for (int i = 0; i < 8; i ++)
20702 temp_operands[i] = operands[i];
20703
20704 /* Sort the operands. */
20705 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
20706
20707 /* Copy the memory operands so that if we have to bail for some
20708 reason the original addresses are unchanged. */
20709 if (load)
20710 {
20711 mem_1 = copy_rtx (temp_operands[1]);
20712 mem_2 = copy_rtx (temp_operands[3]);
20713 mem_3 = copy_rtx (temp_operands[5]);
20714 mem_4 = copy_rtx (temp_operands[7]);
20715 }
20716 else
20717 {
20718 mem_1 = copy_rtx (temp_operands[0]);
20719 mem_2 = copy_rtx (temp_operands[2]);
20720 mem_3 = copy_rtx (temp_operands[4]);
20721 mem_4 = copy_rtx (temp_operands[6]);
20722 gcc_assert (code == UNKNOWN);
20723 }
20724
20725 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20726 extract_base_offset_in_addr (mem_3, &base, &offset_3);
20727 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
20728 && offset_3 != NULL_RTX);
20729
20730 /* Adjust offset so it can fit in LDP/STP instruction. */
20731 msize = GET_MODE_SIZE (mode);
20732 stp_off_upper_limit = msize * (0x40 - 1);
20733 stp_off_lower_limit = - msize * 0x40;
20734
20735 off_val_1 = INTVAL (offset_1);
20736 off_val_3 = INTVAL (offset_3);
20737
20738 /* The base offset is optimally half way between the two STP/LDP offsets. */
20739 if (msize <= 4)
20740 base_off = (off_val_1 + off_val_3) / 2;
20741 else
20742 /* However, due to issues with negative LDP/STP offset generation for
20743 larger modes (DF, DI and vector modes), we must not use negative
20744 addresses beyond what 9 signed unadjusted bits can store. This
20745 provides the most range in this case. */
20746 base_off = off_val_1;
20747
20748 /* Adjust the base so that it is aligned with the addresses but still
20749 optimal. */
20750 if (base_off % msize != off_val_1 % msize)
20751 /* Fix the offset, bearing in mind we want to make it bigger not
20752 smaller. */
20753 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
20754 else if (msize <= 4)
20755 /* The negative range of LDP/STP is one larger than the positive range. */
20756 base_off += msize;
20757
20758 /* Check if base offset is too big or too small. We can attempt to resolve
20759 this issue by setting it to the maximum value and seeing if the offsets
20760 still fit. */
20761 if (base_off >= 0x1000)
20762 {
20763 base_off = 0x1000 - 1;
20764 /* We must still make sure that the base offset is aligned with respect
20765 to the address. But it may not be made any bigger. */
20766 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
20767 }
20768
20769 /* Likewise for the case where the base is too small. */
20770 if (base_off <= -0x1000)
20771 {
20772 base_off = -0x1000 + 1;
20773 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
20774 }
20775
20776 /* Offset of the first STP/LDP. */
20777 new_off_1 = off_val_1 - base_off;
20778
20779 /* Offset of the second STP/LDP. */
20780 new_off_3 = off_val_3 - base_off;
20781
20782 /* The offsets must be within the range of the LDP/STP instructions. */
20783 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
20784 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
20785 return false;
20786
20787 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
20788 new_off_1), true);
20789 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
20790 new_off_1 + msize), true);
20791 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
20792 new_off_3), true);
20793 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
20794 new_off_3 + msize), true);
20795
20796 if (!aarch64_mem_pair_operand (mem_1, mode)
20797 || !aarch64_mem_pair_operand (mem_3, mode))
20798 return false;
20799
20800 if (code == ZERO_EXTEND)
20801 {
20802 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
20803 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
20804 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
20805 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
20806 }
20807 else if (code == SIGN_EXTEND)
20808 {
20809 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
20810 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
20811 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
20812 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
20813 }
20814
20815 if (load)
20816 {
20817 operands[0] = temp_operands[0];
20818 operands[1] = mem_1;
20819 operands[2] = temp_operands[2];
20820 operands[3] = mem_2;
20821 operands[4] = temp_operands[4];
20822 operands[5] = mem_3;
20823 operands[6] = temp_operands[6];
20824 operands[7] = mem_4;
20825 }
20826 else
20827 {
20828 operands[0] = mem_1;
20829 operands[1] = temp_operands[1];
20830 operands[2] = mem_2;
20831 operands[3] = temp_operands[3];
20832 operands[4] = mem_3;
20833 operands[5] = temp_operands[5];
20834 operands[6] = mem_4;
20835 operands[7] = temp_operands[7];
20836 }
20837
20838 /* Emit adjusting instruction. */
20839 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
20840 /* Emit ldp/stp instructions. */
20841 t1 = gen_rtx_SET (operands[0], operands[1]);
20842 t2 = gen_rtx_SET (operands[2], operands[3]);
20843 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
20844 t1 = gen_rtx_SET (operands[4], operands[5]);
20845 t2 = gen_rtx_SET (operands[6], operands[7]);
20846 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
20847 return true;
20848 }
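/* Editor's note: a standalone sketch (not GCC code) of the base-offset
   arithmetic above for small modes (msize <= 4), ignoring the out-of-range
   clamping.  For the 0x100..0x10c stores in the earlier comment it picks a
   base offset of 0x108, giving pair offsets -8 and 0, both within LDP/STP
   range.  */
#if 0
#include <stdio.h>

static void
rebase_offsets (long off1, long off3, long msize)
{
  /* Aim halfway between the two pair offsets, then align it.  */
  long base_off = (off1 + off3) / 2;
  if (base_off % msize != off1 % msize)
    base_off += (((base_off % msize) - (off1 % msize)) + msize) % msize;
  else
    base_off += msize;
  printf ("add scratch, base, #%ld\n", base_off);
  printf ("first pair offset %ld, second pair offset %ld\n",
	  off1 - base_off, off3 - base_off);
}
#endif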
20849
20850 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
20851 it isn't worth branching around empty masked ops (including masked
20852 stores). */
20853
20854 static bool
20855 aarch64_empty_mask_is_expensive (unsigned)
20856 {
20857 return false;
20858 }
20859
20860 /* Return true if a pseudo register should be created and used to hold
20861 the GOT address for PIC code. */
20862
20863 bool
20864 aarch64_use_pseudo_pic_reg (void)
20865 {
20866 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
20867 }
20868
20869 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
20870
20871 static int
20872 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
20873 {
20874 switch (XINT (x, 1))
20875 {
20876 case UNSPEC_GOTSMALLPIC:
20877 case UNSPEC_GOTSMALLPIC28K:
20878 case UNSPEC_GOTTINYPIC:
20879 return 0;
20880 default:
20881 break;
20882 }
20883
20884 return default_unspec_may_trap_p (x, flags);
20885 }
20886
20887
20888 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
20889 return the log2 of that value. Otherwise return -1. */
20890
20891 int
20892 aarch64_fpconst_pow_of_2 (rtx x)
20893 {
20894 const REAL_VALUE_TYPE *r;
20895
20896 if (!CONST_DOUBLE_P (x))
20897 return -1;
20898
20899 r = CONST_DOUBLE_REAL_VALUE (x);
20900
20901 if (REAL_VALUE_NEGATIVE (*r)
20902 || REAL_VALUE_ISNAN (*r)
20903 || REAL_VALUE_ISINF (*r)
20904 || !real_isinteger (r, DFmode))
20905 return -1;
20906
20907 return exact_log2 (real_to_integer (r));
20908 }
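/* Editor's note: a standalone sketch (not GCC code) of the same
   classification on a plain double: return log2 (X) when X is a positive
   integral power of two, otherwise -1.  The real function above works on
   CONST_DOUBLE rtxes and REAL_VALUE_TYPEs instead, so behaviour at the
   extremes may differ.  */
#if 0
#include <math.h>

static int
double_pow2_log2 (double x)
{
  int exp;
  /* Powers of two are exactly the finite values whose frexp mantissa
     is 0.5; also require X to be a positive integer.  */
  if (!isfinite (x) || !(x > 0) || x != floor (x) || frexp (x, &exp) != 0.5)
    return -1;
  return exp - 1;
}
#endif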
20909
20910 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
20911 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
20912 X == 1/2^n return N. Otherwise return -1. */
20913
20914 int
20915 aarch64_fpconst_pow2_recip (rtx x)
20916 {
20917 REAL_VALUE_TYPE r0;
20918
20919 if (!CONST_DOUBLE_P (x))
20920 return -1;
20921
20922 r0 = *CONST_DOUBLE_REAL_VALUE (x);
20923 if (exact_real_inverse (DFmode, &r0)
20924 && !REAL_VALUE_NEGATIVE (r0))
20925 {
20926 int ret = exact_log2 (real_to_integer (&r0));
20927 if (ret >= 1 && ret <= 32)
20928 return ret;
20929 }
20930 return -1;
20931 }
20932
20933 /* If X is a vector of equal CONST_DOUBLE values and that value is
20934 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20935
20936 int
20937 aarch64_vec_fpconst_pow_of_2 (rtx x)
20938 {
20939 int nelts;
20940 if (GET_CODE (x) != CONST_VECTOR
20941 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
20942 return -1;
20943
20944 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
20945 return -1;
20946
20947 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
20948 if (firstval <= 0)
20949 return -1;
20950
20951 for (int i = 1; i < nelts; i++)
20952 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
20953 return -1;
20954
20955 return firstval;
20956 }
20957
20958 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20959 to float.
20960
20961 __fp16 always promotes through this hook.
20962 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20963 through the generic excess precision logic rather than here. */
20964
20965 static tree
20966 aarch64_promoted_type (const_tree t)
20967 {
20968 if (SCALAR_FLOAT_TYPE_P (t)
20969 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
20970 return float_type_node;
20971
20972 return NULL_TREE;
20973 }
20974
20975 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20976
20977 static bool
20978 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
20979 optimization_type opt_type)
20980 {
20981 switch (op)
20982 {
20983 case rsqrt_optab:
20984 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
20985
20986 default:
20987 return true;
20988 }
20989 }
20990
20991 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20992
20993 static unsigned int
20994 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
20995 int *offset)
20996 {
20997 /* Polynomial invariant 1 == (VG / 2) - 1. */
20998 gcc_assert (i == 1);
20999 *factor = 2;
21000 *offset = 1;
21001 return AARCH64_DWARF_VG;
21002 }
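
/* Illustrative standalone sketch (not part of this file) of how a
   consumer of the DWARF info evaluates a poly_int-valued quantity:
   indeterminate 1 is computed as (VG / factor) - offset, i.e.
   (VG / 2) - 1, where VG is the number of 64-bit granules in an SVE
   vector (2 for 128-bit vectors, 4 for 256-bit, ...).  The helper name
   eval_sve_poly and the example coefficients are hypothetical.  */
static long
eval_sve_poly (long c0, long c1, long vg)
{
  long indeterminate = vg / 2 - 1;
  return c0 + c1 * indeterminate;
}

/* Example: a frame offset of (16 + 16x) bytes evaluates to
   eval_sve_poly (16, 16, 2) == 16 bytes with 128-bit vectors and to
   eval_sve_poly (16, 16, 4) == 32 bytes with 256-bit vectors.  */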
21003
21004 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21005 if MODE is HFmode, and punt to the generic implementation otherwise. */
21006
21007 static bool
21008 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21009 {
21010 return (mode == HFmode
21011 ? true
21012 : default_libgcc_floating_mode_supported_p (mode));
21013 }
21014
21015 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21016 if MODE is HFmode, and punt to the generic implementation otherwise. */
21017
21018 static bool
21019 aarch64_scalar_mode_supported_p (scalar_mode mode)
21020 {
21021 return (mode == HFmode
21022 ? true
21023 : default_scalar_mode_supported_p (mode));
21024 }
21025
21026 /* Set the value of FLT_EVAL_METHOD.
21027 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21028
21029 0: evaluate all operations and constants, whose semantic type has at
21030 most the range and precision of type float, to the range and
21031 precision of float; evaluate all other operations and constants to
21032 the range and precision of the semantic type;
21033
21034 N, where _FloatN is a supported interchange floating type:
21035 evaluate all operations and constants, whose semantic type has at
21036 most the range and precision of the _FloatN type, to the range and
21037 precision of the _FloatN type; evaluate all other operations and
21038 constants to the range and precision of the semantic type;
21039
21040 If we have the ARMv8.2-A extensions then we support _Float16 in native
21041 precision, so we should set this to 16. Otherwise, we support the type,
21042 but want to evaluate expressions in float precision, so set this to
21043 0. */
21044
21045 static enum flt_eval_method
21046 aarch64_excess_precision (enum excess_precision_type type)
21047 {
21048 switch (type)
21049 {
21050 case EXCESS_PRECISION_TYPE_FAST:
21051 case EXCESS_PRECISION_TYPE_STANDARD:
21052 /* We can calculate either in 16-bit range and precision or
21053 32-bit range and precision. Make that decision based on whether
21054 we have native support for the ARMv8.2-A 16-bit floating-point
21055 instructions or not. */
21056 return (TARGET_FP_F16INST
21057 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21058 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21059 case EXCESS_PRECISION_TYPE_IMPLICIT:
21060 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21061 default:
21062 gcc_unreachable ();
21063 }
21064 return FLT_EVAL_METHOD_UNPREDICTABLE;
21065 }
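
/* Illustrative standalone example (not part of this file) of the two
   evaluation methods chosen above; assumes a compiler that provides
   _Float16 (ISO/IEC TS 18661-3).  */
static _Float16
f16_dot2 (_Float16 a0, _Float16 b0, _Float16 a1, _Float16 b1)
{
  /* With FLT_EVAL_METHOD == 16 (TARGET_FP_F16INST) the products and the
     sum are evaluated in half precision using the native FP16
     instructions.  With FLT_EVAL_METHOD == 0 the operands are widened
     to float, the arithmetic happens in single precision, and the
     result is rounded back to _Float16 on return.  */
  return a0 * b0 + a1 * b1;
}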
21066
21067 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21068 scheduled for speculative execution. Reject the long-running division
21069 and square-root instructions. */
21070
21071 static bool
21072 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21073 {
21074 switch (get_attr_type (insn))
21075 {
21076 case TYPE_SDIV:
21077 case TYPE_UDIV:
21078 case TYPE_FDIVS:
21079 case TYPE_FDIVD:
21080 case TYPE_FSQRTS:
21081 case TYPE_FSQRTD:
21082 case TYPE_NEON_FP_SQRT_S:
21083 case TYPE_NEON_FP_SQRT_D:
21084 case TYPE_NEON_FP_SQRT_S_Q:
21085 case TYPE_NEON_FP_SQRT_D_Q:
21086 case TYPE_NEON_FP_DIV_S:
21087 case TYPE_NEON_FP_DIV_D:
21088 case TYPE_NEON_FP_DIV_S_Q:
21089 case TYPE_NEON_FP_DIV_D_Q:
21090 return false;
21091 default:
21092 return true;
21093 }
21094 }
21095
21096 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21097
21098 static int
21099 aarch64_compute_pressure_classes (reg_class *classes)
21100 {
21101 int i = 0;
21102 classes[i++] = GENERAL_REGS;
21103 classes[i++] = FP_REGS;
21104 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21105 registers need to go in PR_LO_REGS at some point during their
21106 lifetime. Splitting it into two halves has the effect of making
21107 all predicates count against PR_LO_REGS, so that we try whenever
21108 possible to restrict the number of live predicates to 8. This
21109 greatly reduces the amount of spilling in certain loops. */
21110 classes[i++] = PR_LO_REGS;
21111 classes[i++] = PR_HI_REGS;
21112 return i;
21113 }
21114
21115 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21116
21117 static bool
21118 aarch64_can_change_mode_class (machine_mode from,
21119 machine_mode to, reg_class_t)
21120 {
21121 if (BYTES_BIG_ENDIAN)
21122 {
21123 bool from_sve_p = aarch64_sve_data_mode_p (from);
21124 bool to_sve_p = aarch64_sve_data_mode_p (to);
21125
21126 /* Don't allow changes between SVE data modes and non-SVE modes.
21127 See the comment at the head of aarch64-sve.md for details. */
21128 if (from_sve_p != to_sve_p)
21129 return false;
21130
21131 /* Don't allow changes in element size: lane 0 of the new vector
21132 would not then be lane 0 of the old vector. See the comment
21133 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21134 description.
21135
21136 In the worst case, this forces a register to be spilled in
21137 one mode and reloaded in the other, which handles the
21138 endianness correctly. */
21139 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21140 return false;
21141 }
21142 return true;
21143 }
21144
21145 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
21146
21147 static void
21148 aarch64_select_early_remat_modes (sbitmap modes)
21149 {
21150 /* SVE values are not normally live across a call, so it should be
21151 worth doing early rematerialization even in VL-specific mode. */
21152 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21153 if (aarch64_sve_mode_p ((machine_mode) i))
21154 bitmap_set_bit (modes, i);
21155 }
21156
21157 /* Override the default target speculation_safe_value. */
21158 static rtx
21159 aarch64_speculation_safe_value (machine_mode mode,
21160 rtx result, rtx val, rtx failval)
21161 {
21162 /* Maybe we should warn if falling back to hard barriers. They are
21163 likely to be noticeably more expensive than the alternative below. */
21164 if (!aarch64_track_speculation)
21165 return default_speculation_safe_value (mode, result, val, failval);
21166
21167 if (!REG_P (val))
21168 val = copy_to_mode_reg (mode, val);
21169
21170 if (!aarch64_reg_or_zero (failval, mode))
21171 failval = copy_to_mode_reg (mode, failval);
21172
21173 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21174 return result;
21175 }
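
/* Illustrative standalone example (not part of this file) of the
   user-level builtin that this hook expands.  With -mtrack-speculation
   the despeculate_copy pattern above is used; otherwise GCC falls back
   to the default hard speculation barrier.  The function name and the
   bound-check shape are hypothetical.  */
int
load_element (const int *array, unsigned long idx, unsigned long bound)
{
  if (idx < bound)
    {
      /* On a mis-speculated path the index is forced to the failval (0)
         before the dereference, so it cannot act as a Spectre gadget.  */
      idx = __builtin_speculation_safe_value (idx, 0);
      return array[idx];
    }
  return 0;
}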
21176
21177 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21178 Look into the tuning structure for an estimate.
21179 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21180 Advanced SIMD 128 bits. */
21181
21182 static HOST_WIDE_INT
21183 aarch64_estimated_poly_value (poly_int64 val)
21184 {
21185 enum aarch64_sve_vector_bits_enum width_source
21186 = aarch64_tune_params.sve_width;
21187
21188 /* If the tuning structure provides no estimate, use the default. */
21189 if (width_source == SVE_SCALABLE)
21190 return default_estimated_poly_value (val);
21191
21192 HOST_WIDE_INT over_128 = width_source - 128;
21193 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21194 }
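
/* Illustrative standalone sketch (not part of this file) of the
   estimate above; the helper name estimated_bytes and the example
   values are hypothetical.  With sve_width == 256, a poly_int64 of
   (16 + 16x) bytes -- coeffs[0] == 16, coeffs[1] == 16 -- is estimated
   as 16 + 16 * (256 - 128) / 128 == 32 bytes, i.e. one 256-bit vector.
   With sve_width == SVE_SCALABLE the generic default is used instead.  */
static long
estimated_bytes (long c0, long c1, long sve_width_bits)
{
  long over_128 = sve_width_bits - 128;
  return c0 + c1 * over_128 / 128;
}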
21195
21196
21197 /* Return true for types that could be supported as SIMD return or
21198 argument types. */
21199
21200 static bool
21201 supported_simd_type (tree t)
21202 {
21203 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21204 {
21205 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21206 return s == 1 || s == 2 || s == 4 || s == 8;
21207 }
21208 return false;
21209 }
21210
21211 /* Return true for types that are currently supported as SIMD return
21212 or argument types. */
21213
21214 static bool
21215 currently_supported_simd_type (tree t, tree b)
21216 {
21217 if (COMPLEX_FLOAT_TYPE_P (t))
21218 return false;
21219
21220 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21221 return false;
21222
21223 return supported_simd_type (t);
21224 }
21225
21226 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21227
21228 static int
21229 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21230 struct cgraph_simd_clone *clonei,
21231 tree base_type, int num)
21232 {
21233 tree t, ret_type, arg_type;
21234 unsigned int elt_bits, vec_bits, count;
21235
21236 if (!TARGET_SIMD)
21237 return 0;
21238
21239 if (clonei->simdlen
21240 && (clonei->simdlen < 2
21241 || clonei->simdlen > 1024
21242 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21243 {
21244 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21245 "unsupported simdlen %d", clonei->simdlen);
21246 return 0;
21247 }
21248
21249 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21250 if (TREE_CODE (ret_type) != VOID_TYPE
21251 && !currently_supported_simd_type (ret_type, base_type))
21252 {
21253 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21254 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21255 "GCC does not currently support mixed size types "
21256 "for %<simd%> functions");
21257 else if (supported_simd_type (ret_type))
21258 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21259 "GCC does not currently support return type %qT "
21260 "for %<simd%> functions", ret_type);
21261 else
21262 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21263 "unsupported return type %qT for %<simd%> functions",
21264 ret_type);
21265 return 0;
21266 }
21267
21268 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21269 {
21270 arg_type = TREE_TYPE (t);
21271
21272 if (!currently_supported_simd_type (arg_type, base_type))
21273 {
21274 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21275 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21276 "GCC does not currently support mixed size types "
21277 "for %<simd%> functions");
21278 else
21279 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21280 "GCC does not currently support argument type %qT "
21281 "for %<simd%> functions", arg_type);
21282 return 0;
21283 }
21284 }
21285
21286 clonei->vecsize_mangle = 'n';
21287 clonei->mask_mode = VOIDmode;
21288 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21289 if (clonei->simdlen == 0)
21290 {
21291 count = 2;
21292 vec_bits = (num == 0 ? 64 : 128);
21293 clonei->simdlen = vec_bits / elt_bits;
21294 }
21295 else
21296 {
21297 count = 1;
21298 vec_bits = clonei->simdlen * elt_bits;
21299 if (vec_bits != 64 && vec_bits != 128)
21300 {
21301 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21302 "GCC does not currently support simdlen %d for type %qT",
21303 clonei->simdlen, base_type);
21304 return 0;
21305 }
21306 }
21307 clonei->vecsize_int = vec_bits;
21308 clonei->vecsize_float = vec_bits;
21309 return count;
21310 }
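
/* Illustrative standalone sketch (not part of this file) of the
   simdlen choice above for a function declared with
   "#pragma omp declare simd" (or __attribute__ ((simd))) and no
   explicit simdlen clause: two Advanced SIMD clones are created, one
   for 64-bit and one for 128-bit vectors.  The helper name simdlen_for
   is hypothetical.  */
static unsigned int
simdlen_for (unsigned int vec_bits, unsigned int elt_bits)
{
  return vec_bits / elt_bits;
}

/* For a 32-bit element type such as float or int:
   simdlen_for (64, 32) == 2 and simdlen_for (128, 32) == 4.
   For a 64-bit element type: simdlen_for (128, 64) == 2.  */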
21311
21312 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21313
21314 static void
21315 aarch64_simd_clone_adjust (struct cgraph_node *node)
21316 {
21317 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21318 use the correct ABI. */
21319
21320 tree t = TREE_TYPE (node->decl);
21321 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21322 TYPE_ATTRIBUTES (t));
21323 }
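
/* Illustrative standalone example (not part of this file) of the
   attribute added above, as it can also be written by hand: a callee
   using the vector PCS variant preserves q8-q23 in full rather than
   only the low halves of v8-v15, which lets vectorized callers keep
   more live vector values across the call.  */
__attribute__ ((aarch64_vector_pcs))
void vector_callee (void);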
21324
21325 /* Implement TARGET_SIMD_CLONE_USABLE. */
21326
21327 static int
21328 aarch64_simd_clone_usable (struct cgraph_node *node)
21329 {
21330 switch (node->simdclone->vecsize_mangle)
21331 {
21332 case 'n':
21333 if (!TARGET_SIMD)
21334 return -1;
21335 return 0;
21336 default:
21337 gcc_unreachable ();
21338 }
21339 }
21340
21341 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
21342
21343 static int
21344 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21345 {
21346 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21347 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21348 return 0;
21349 return 1;
21350 }
21351
21352 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
21353
21354 static const char *
21355 aarch64_get_multilib_abi_name (void)
21356 {
21357 if (TARGET_BIG_END)
21358 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21359 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21360 }
21361
21362 /* Implement TARGET_STACK_PROTECT_GUARD. If the guard is a
21363 global variable, use the default implementation; otherwise
21364 return NULL_TREE. */
21365 static tree
21366 aarch64_stack_protect_guard (void)
21367 {
21368 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21369 return default_stack_protect_guard ();
21370
21371 return NULL_TREE;
21372 }
21373
21374 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21375 section at the end if needed. */
21376 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21377 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21378 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21379 void
21380 aarch64_file_end_indicate_exec_stack ()
21381 {
21382 file_end_indicate_exec_stack ();
21383
21384 unsigned feature_1_and = 0;
21385 if (aarch64_bti_enabled ())
21386 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
21387
21388 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
21389 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
21390
21391 if (feature_1_and)
21392 {
21393 /* Generate .note.gnu.property section. */
21394 switch_to_section (get_section (".note.gnu.property",
21395 SECTION_NOTYPE, NULL));
21396
21397 /* PT_NOTE header: namesz, descsz, type.
21398 namesz = 4 ("GNU\0")
21399 descsz = 16 (Size of the program property array)
21400 [(12 + padding) * Number of array elements]
21401 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21402 assemble_align (POINTER_SIZE);
21403 assemble_integer (GEN_INT (4), 4, 32, 1);
21404 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
21405 assemble_integer (GEN_INT (5), 4, 32, 1);
21406
21407 /* PT_NOTE name. */
21408 assemble_string ("GNU", 4);
21409
21410 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21411 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21412 datasz = 4
21413 data = feature_1_and. */
21414 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
21415 assemble_integer (GEN_INT (4), 4, 32, 1);
21416 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
21417
21418 /* Pad the size of the note to the required alignment. */
21419 assemble_align (POINTER_SIZE);
21420 }
21421 }
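
/* Illustrative standalone sketch (not part of this file) of the byte
   layout emitted above on an LP64 target, with hypothetical struct and
   field names; descsz is 12 rounded up to POINTER_BYTES == 8, and with
   both BTI and PAC enabled the property data is 3.  */
#include <stdint.h>

struct aarch64_gnu_property_note
{
  uint32_t namesz;      /* 4: strlen ("GNU") + 1.  */
  uint32_t descsz;      /* 16: 12-byte property padded to an 8-byte multiple.  */
  uint32_t type;        /* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char name[4];         /* "GNU\0".  */
  uint32_t pr_type;     /* GNU_PROPERTY_AARCH64_FEATURE_1_AND (0xc0000000).  */
  uint32_t pr_datasz;   /* 4.  */
  uint32_t pr_data;     /* feature_1_and: BTI (bit 0) | PAC (bit 1).  */
  uint32_t pad;         /* Padding up to the 8-byte note alignment.  */
};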
21422 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21423 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21424 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
21425
21426 /* Target-specific selftests. */
21427
21428 #if CHECKING_P
21429
21430 namespace selftest {
21431
21432 /* Selftest for the RTL loader.
21433 Verify that the RTL loader copes with a dump from
21434 print_rtx_function. This is essentially just a test that class
21435 function_reader can handle a real dump, but it also verifies
21436 that lookup_reg_by_dump_name correctly handles hard regs.
21437 The presence of hard reg names in the dump means that the test is
21438 target-specific, hence it is in this file. */
21439
21440 static void
21441 aarch64_test_loading_full_dump ()
21442 {
21443 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
21444
21445 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
21446
21447 rtx_insn *insn_1 = get_insn_by_uid (1);
21448 ASSERT_EQ (NOTE, GET_CODE (insn_1));
21449
21450 rtx_insn *insn_15 = get_insn_by_uid (15);
21451 ASSERT_EQ (INSN, GET_CODE (insn_15));
21452 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
21453
21454 /* Verify crtl->return_rtx. */
21455 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
21456 ASSERT_EQ (0, REGNO (crtl->return_rtx));
21457 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
21458 }
21459
21460 /* Run all target-specific selftests. */
21461
21462 static void
21463 aarch64_run_selftests (void)
21464 {
21465 aarch64_test_loading_full_dump ();
21466 }
21467
21468 } // namespace selftest
21469
21470 #endif /* #if CHECKING_P */
21471
21472 #undef TARGET_STACK_PROTECT_GUARD
21473 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21474
21475 #undef TARGET_ADDRESS_COST
21476 #define TARGET_ADDRESS_COST aarch64_address_cost
21477
21478 /* This hook determines whether unnamed bitfields affect the alignment
21479 of the containing structure. The hook returns true if the structure
21480 should inherit the alignment requirements of an unnamed bitfield's
21481 type. */
21482 #undef TARGET_ALIGN_ANON_BITFIELD
21483 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21484
21485 #undef TARGET_ASM_ALIGNED_DI_OP
21486 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21487
21488 #undef TARGET_ASM_ALIGNED_HI_OP
21489 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21490
21491 #undef TARGET_ASM_ALIGNED_SI_OP
21492 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21493
21494 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21495 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21496 hook_bool_const_tree_hwi_hwi_const_tree_true
21497
21498 #undef TARGET_ASM_FILE_START
21499 #define TARGET_ASM_FILE_START aarch64_start_file
21500
21501 #undef TARGET_ASM_OUTPUT_MI_THUNK
21502 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21503
21504 #undef TARGET_ASM_SELECT_RTX_SECTION
21505 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21506
21507 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21508 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21509
21510 #undef TARGET_BUILD_BUILTIN_VA_LIST
21511 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21512
21513 #undef TARGET_CALLEE_COPIES
21514 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21515
21516 #undef TARGET_CAN_ELIMINATE
21517 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21518
21519 #undef TARGET_CAN_INLINE_P
21520 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21521
21522 #undef TARGET_CANNOT_FORCE_CONST_MEM
21523 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21524
21525 #undef TARGET_CASE_VALUES_THRESHOLD
21526 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21527
21528 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21529 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21530
21531 /* Only the least significant bit is used for initialization guard
21532 variables. */
21533 #undef TARGET_CXX_GUARD_MASK_BIT
21534 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21535
21536 #undef TARGET_C_MODE_FOR_SUFFIX
21537 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
21538
21539 #ifdef TARGET_BIG_ENDIAN_DEFAULT
21540 #undef TARGET_DEFAULT_TARGET_FLAGS
21541 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
21542 #endif
21543
21544 #undef TARGET_CLASS_MAX_NREGS
21545 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
21546
21547 #undef TARGET_BUILTIN_DECL
21548 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
21549
21550 #undef TARGET_BUILTIN_RECIPROCAL
21551 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
21552
21553 #undef TARGET_C_EXCESS_PRECISION
21554 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
21555
21556 #undef TARGET_EXPAND_BUILTIN
21557 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
21558
21559 #undef TARGET_EXPAND_BUILTIN_VA_START
21560 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
21561
21562 #undef TARGET_FOLD_BUILTIN
21563 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
21564
21565 #undef TARGET_FUNCTION_ARG
21566 #define TARGET_FUNCTION_ARG aarch64_function_arg
21567
21568 #undef TARGET_FUNCTION_ARG_ADVANCE
21569 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
21570
21571 #undef TARGET_FUNCTION_ARG_BOUNDARY
21572 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
21573
21574 #undef TARGET_FUNCTION_ARG_PADDING
21575 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
21576
21577 #undef TARGET_GET_RAW_RESULT_MODE
21578 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
21579 #undef TARGET_GET_RAW_ARG_MODE
21580 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
21581
21582 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
21583 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
21584
21585 #undef TARGET_FUNCTION_VALUE
21586 #define TARGET_FUNCTION_VALUE aarch64_function_value
21587
21588 #undef TARGET_FUNCTION_VALUE_REGNO_P
21589 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
21590
21591 #undef TARGET_GIMPLE_FOLD_BUILTIN
21592 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
21593
21594 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
21595 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
21596
21597 #undef TARGET_INIT_BUILTINS
21598 #define TARGET_INIT_BUILTINS aarch64_init_builtins
21599
21600 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
21601 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
21602 aarch64_ira_change_pseudo_allocno_class
21603
21604 #undef TARGET_LEGITIMATE_ADDRESS_P
21605 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
21606
21607 #undef TARGET_LEGITIMATE_CONSTANT_P
21608 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
21609
21610 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
21611 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
21612 aarch64_legitimize_address_displacement
21613
21614 #undef TARGET_LIBGCC_CMP_RETURN_MODE
21615 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
21616
21617 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
21618 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
21619 aarch64_libgcc_floating_mode_supported_p
21620
21621 #undef TARGET_MANGLE_TYPE
21622 #define TARGET_MANGLE_TYPE aarch64_mangle_type
21623
21624 #undef TARGET_MEMORY_MOVE_COST
21625 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
21626
21627 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
21628 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
21629
21630 #undef TARGET_MUST_PASS_IN_STACK
21631 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
21632
21633 /* This target hook should return true if accesses to volatile bitfields
21634 should use the narrowest mode possible. It should return false if these
21635 accesses should use the bitfield container type. */
21636 #undef TARGET_NARROW_VOLATILE_BITFIELD
21637 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
21638
21639 #undef TARGET_OPTION_OVERRIDE
21640 #define TARGET_OPTION_OVERRIDE aarch64_override_options
21641
21642 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
21643 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
21644 aarch64_override_options_after_change
21645
21646 #undef TARGET_OPTION_SAVE
21647 #define TARGET_OPTION_SAVE aarch64_option_save
21648
21649 #undef TARGET_OPTION_RESTORE
21650 #define TARGET_OPTION_RESTORE aarch64_option_restore
21651
21652 #undef TARGET_OPTION_PRINT
21653 #define TARGET_OPTION_PRINT aarch64_option_print
21654
21655 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
21656 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
21657
21658 #undef TARGET_SET_CURRENT_FUNCTION
21659 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
21660
21661 #undef TARGET_PASS_BY_REFERENCE
21662 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
21663
21664 #undef TARGET_PREFERRED_RELOAD_CLASS
21665 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
21666
21667 #undef TARGET_SCHED_REASSOCIATION_WIDTH
21668 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
21669
21670 #undef TARGET_PROMOTED_TYPE
21671 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
21672
21673 #undef TARGET_SECONDARY_RELOAD
21674 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
21675
21676 #undef TARGET_SHIFT_TRUNCATION_MASK
21677 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
21678
21679 #undef TARGET_SETUP_INCOMING_VARARGS
21680 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
21681
21682 #undef TARGET_STRUCT_VALUE_RTX
21683 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
21684
21685 #undef TARGET_REGISTER_MOVE_COST
21686 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
21687
21688 #undef TARGET_RETURN_IN_MEMORY
21689 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
21690
21691 #undef TARGET_RETURN_IN_MSB
21692 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
21693
21694 #undef TARGET_RTX_COSTS
21695 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
21696
21697 #undef TARGET_SCALAR_MODE_SUPPORTED_P
21698 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
21699
21700 #undef TARGET_SCHED_ISSUE_RATE
21701 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
21702
21703 #undef TARGET_SCHED_VARIABLE_ISSUE
21704 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
21705
21706 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
21707 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
21708 aarch64_sched_first_cycle_multipass_dfa_lookahead
21709
21710 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
21711 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
21712 aarch64_first_cycle_multipass_dfa_lookahead_guard
21713
21714 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
21715 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
21716 aarch64_get_separate_components
21717
21718 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
21719 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
21720 aarch64_components_for_bb
21721
21722 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
21723 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
21724 aarch64_disqualify_components
21725
21726 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
21727 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
21728 aarch64_emit_prologue_components
21729
21730 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
21731 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
21732 aarch64_emit_epilogue_components
21733
21734 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
21735 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
21736 aarch64_set_handled_components
21737
21738 #undef TARGET_TRAMPOLINE_INIT
21739 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
21740
21741 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
21742 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
21743
21744 #undef TARGET_VECTOR_MODE_SUPPORTED_P
21745 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
21746
21747 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
21748 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
21749 aarch64_builtin_support_vector_misalignment
21750
21751 #undef TARGET_ARRAY_MODE
21752 #define TARGET_ARRAY_MODE aarch64_array_mode
21753
21754 #undef TARGET_ARRAY_MODE_SUPPORTED_P
21755 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
21756
21757 #undef TARGET_VECTORIZE_ADD_STMT_COST
21758 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
21759
21760 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
21761 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
21762 aarch64_builtin_vectorization_cost
21763
21764 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
21765 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
21766
21767 #undef TARGET_VECTORIZE_BUILTINS
21768 #define TARGET_VECTORIZE_BUILTINS
21769
21770 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
21771 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
21772 aarch64_builtin_vectorized_function
21773
21774 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
21775 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
21776 aarch64_autovectorize_vector_sizes
21777
21778 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
21779 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
21780 aarch64_atomic_assign_expand_fenv
21781
21782 /* Section anchor support. */
21783
21784 #undef TARGET_MIN_ANCHOR_OFFSET
21785 #define TARGET_MIN_ANCHOR_OFFSET -256
21786
21787 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
21788 byte offset; we can do much more for larger data types, but have no way
21789 to determine the size of the access. We assume accesses are aligned. */
21790 #undef TARGET_MAX_ANCHOR_OFFSET
21791 #define TARGET_MAX_ANCHOR_OFFSET 4095
21792
21793 #undef TARGET_VECTOR_ALIGNMENT
21794 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
21795
21796 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
21797 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
21798 aarch64_vectorize_preferred_vector_alignment
21799 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
21800 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
21801 aarch64_simd_vector_alignment_reachable
21802
21803 /* vec_perm support. */
21804
21805 #undef TARGET_VECTORIZE_VEC_PERM_CONST
21806 #define TARGET_VECTORIZE_VEC_PERM_CONST \
21807 aarch64_vectorize_vec_perm_const
21808
21809 #undef TARGET_VECTORIZE_GET_MASK_MODE
21810 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
21811 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
21812 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
21813 aarch64_empty_mask_is_expensive
21814 #undef TARGET_PREFERRED_ELSE_VALUE
21815 #define TARGET_PREFERRED_ELSE_VALUE \
21816 aarch64_preferred_else_value
21817
21818 #undef TARGET_INIT_LIBFUNCS
21819 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
21820
21821 #undef TARGET_FIXED_CONDITION_CODE_REGS
21822 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
21823
21824 #undef TARGET_FLAGS_REGNUM
21825 #define TARGET_FLAGS_REGNUM CC_REGNUM
21826
21827 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
21828 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
21829
21830 #undef TARGET_ASAN_SHADOW_OFFSET
21831 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
21832
21833 #undef TARGET_LEGITIMIZE_ADDRESS
21834 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
21835
21836 #undef TARGET_SCHED_CAN_SPECULATE_INSN
21837 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
21838
21839 #undef TARGET_CAN_USE_DOLOOP_P
21840 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
21841
21842 #undef TARGET_SCHED_ADJUST_PRIORITY
21843 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
21844
21845 #undef TARGET_SCHED_MACRO_FUSION_P
21846 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
21847
21848 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
21849 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
21850
21851 #undef TARGET_SCHED_FUSION_PRIORITY
21852 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
21853
21854 #undef TARGET_UNSPEC_MAY_TRAP_P
21855 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
21856
21857 #undef TARGET_USE_PSEUDO_PIC_REG
21858 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
21859
21860 #undef TARGET_PRINT_OPERAND
21861 #define TARGET_PRINT_OPERAND aarch64_print_operand
21862
21863 #undef TARGET_PRINT_OPERAND_ADDRESS
21864 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
21865
21866 #undef TARGET_OPTAB_SUPPORTED_P
21867 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
21868
21869 #undef TARGET_OMIT_STRUCT_RETURN_REG
21870 #define TARGET_OMIT_STRUCT_RETURN_REG true
21871
21872 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
21873 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
21874 aarch64_dwarf_poly_indeterminate_value
21875
21876 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
21877 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
21878 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
21879
21880 #undef TARGET_HARD_REGNO_NREGS
21881 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
21882 #undef TARGET_HARD_REGNO_MODE_OK
21883 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
21884
21885 #undef TARGET_MODES_TIEABLE_P
21886 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
21887
21888 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
21889 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
21890 aarch64_hard_regno_call_part_clobbered
21891
21892 #undef TARGET_INSN_CALLEE_ABI
21893 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
21894
21895 #undef TARGET_CONSTANT_ALIGNMENT
21896 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
21897
21898 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
21899 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
21900 aarch64_stack_clash_protection_alloca_probe_range
21901
21902 #undef TARGET_COMPUTE_PRESSURE_CLASSES
21903 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
21904
21905 #undef TARGET_CAN_CHANGE_MODE_CLASS
21906 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
21907
21908 #undef TARGET_SELECT_EARLY_REMAT_MODES
21909 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
21910
21911 #undef TARGET_SPECULATION_SAFE_VALUE
21912 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
21913
21914 #undef TARGET_ESTIMATED_POLY_VALUE
21915 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
21916
21917 #undef TARGET_ATTRIBUTE_TABLE
21918 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
21919
21920 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
21921 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
21922 aarch64_simd_clone_compute_vecsize_and_simdlen
21923
21924 #undef TARGET_SIMD_CLONE_ADJUST
21925 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
21926
21927 #undef TARGET_SIMD_CLONE_USABLE
21928 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
21929
21930 #undef TARGET_COMP_TYPE_ATTRIBUTES
21931 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
21932
21933 #undef TARGET_GET_MULTILIB_ABI_NAME
21934 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
21935
21936 #undef TARGET_FNTYPE_ABI
21937 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
21938
21939 #if CHECKING_P
21940 #undef TARGET_RUN_TARGET_SELFTESTS
21941 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
21942 #endif /* #if CHECKING_P */
21943
21944 #undef TARGET_ASM_POST_CFI_STARTPROC
21945 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21946
21947 #undef TARGET_STRICT_ARGUMENT_NAMING
21948 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
21949
21950 struct gcc_target targetm = TARGET_INITIALIZER;
21951
21952 #include "gt-aarch64.h"