gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
87 {
88 enum insn_type { MOV, MVN, INDEX, PTRUE };
89 enum modifier_type { LSL, MSL };
90
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode, rtx);
93 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
94 insn_type = MOV, modifier_type = LSL,
95 unsigned int = 0);
96 simd_immediate_info (scalar_mode, rtx, rtx);
97 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98
99 /* The mode of the elements. */
100 scalar_mode elt_mode;
101
102 /* The instruction to use to move the immediate into a vector. */
103 insn_type insn;
104
105 union
106 {
107 /* For MOV and MVN. */
108 struct
109 {
110 /* The value of each element. */
111 rtx value;
112
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier;
116 unsigned int shift;
117 } mov;
118
119 /* For INDEX. */
120 struct
121 {
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
124 rtx base, step;
125 } index;
126
127 /* For PTRUE. */
128 aarch64_svpattern pattern;
129 } u;
130 };
131
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
136 : elt_mode (elt_mode_in), insn (MOV)
137 {
138 u.mov.value = value_in;
139 u.mov.modifier = LSL;
140 u.mov.shift = 0;
141 }
142
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
145 fields. */
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in,
148 unsigned HOST_WIDE_INT value_in,
149 insn_type insn_in, modifier_type modifier_in,
150 unsigned int shift_in)
151 : elt_mode (elt_mode_in), insn (insn_in)
152 {
153 u.mov.value = gen_int_mode (value_in, elt_mode_in);
154 u.mov.modifier = modifier_in;
155 u.mov.shift = shift_in;
156 }
157
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
162 : elt_mode (elt_mode_in), insn (INDEX)
163 {
164 u.index.base = base_in;
165 u.index.step = step_in;
166 }
167
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in,
172 aarch64_svpattern pattern_in)
173 : elt_mode (elt_mode_in), insn (PTRUE)
174 {
175 u.pattern = pattern_in;
176 }
177
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel;
180
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg;
183
184 #ifdef HAVE_AS_TLS
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
187 #endif
188
189 static bool aarch64_composite_type_p (const_tree, machine_mode);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
191 const_tree,
192 machine_mode *, int *,
193 bool *);
194 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode);
198 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
200 const_tree type,
201 int misalignment,
202 bool is_packed);
203 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
204 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
205 aarch64_addr_query_type);
206 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version;
210
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune = cortexa53;
213
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags = 0;
216
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads;
219
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer;
222
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string = NULL;
225
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
228
229 /* Support for command line parsing of boolean flags in the tuning
230 structures. */
231 struct aarch64_flag_desc
232 {
233 const char* name;
234 unsigned int flag;
235 };
236
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 {
241 { "none", AARCH64_FUSE_NOTHING },
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL },
244 { NULL, AARCH64_FUSE_NOTHING }
245 };
246
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 {
251 { "none", AARCH64_EXTRA_TUNE_NONE },
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL },
254 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 };
256
257 /* Tuning parameters. */
258
259 static const struct cpu_addrcost_table generic_addrcost_table =
260 {
261 {
262 1, /* hi */
263 0, /* si */
264 0, /* di */
265 1, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
272 0 /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 {
277 {
278 0, /* hi */
279 0, /* si */
280 0, /* di */
281 2, /* ti */
282 },
283 0, /* pre_modify */
284 0, /* post_modify */
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 {
293 {
294 1, /* hi */
295 0, /* si */
296 0, /* di */
297 1, /* ti */
298 },
299 1, /* pre_modify */
300 1, /* post_modify */
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 {
309 {
310 1, /* hi */
311 1, /* si */
312 1, /* di */
313 2, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 {
325 {
326 1, /* hi */
327 0, /* si */
328 0, /* di */
329 1, /* ti */
330 },
331 0, /* pre_modify */
332 0, /* post_modify */
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
336 0, /* imm_offset */
337 };
338
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 {
341 {
342 1, /* hi */
343 1, /* si */
344 1, /* di */
345 2, /* ti */
346 },
347 1, /* pre_modify */
348 1, /* post_modify */
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
352 2, /* imm_offset */
353 };
354
355 static const struct cpu_regmove_cost generic_regmove_cost =
356 {
357 1, /* GP2GP */
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
360 5, /* GP2FP */
361 5, /* FP2GP */
362 2 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 5, /* GP2FP */
371 5, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 {
377 1, /* GP2GP */
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
380 5, /* GP2FP */
381 5, /* FP2GP */
382 2 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (the actual costs are 4 and 9). */
390 9, /* GP2FP */
391 9, /* FP2GP */
392 1 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 {
397 2, /* GP2GP */
398 2, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 2 /* FP2FP */
411 };
412
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 {
415 2, /* GP2GP */
416 /* Avoid the use of int<->fp moves for spilling. */
417 6, /* GP2FP */
418 6, /* FP2GP */
419 4 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of int<->fp moves for spilling. */
426 8, /* GP2FP */
427 8, /* FP2GP */
428 4 /* FP2FP */
429 };
430
431 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 {
433 1, /* GP2GP */
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
436 2, /* GP2FP */
437 3, /* FP2GP */
438 2 /* FP2FP */
439 };
440
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost =
443 {
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
459 };
460
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 {
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
479 };
480
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost =
483 {
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
499 };
500
501 static const struct cpu_vector_cost tsv110_vector_cost =
502 {
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
518 };
519
520 /* Generic costs for vector insn classes. */
521 static const struct cpu_vector_cost cortexa57_vector_cost =
522 {
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
538 };
539
540 static const struct cpu_vector_cost exynosm1_vector_cost =
541 {
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
557 };
558
559 /* Generic costs for vector insn classes. */
560 static const struct cpu_vector_cost xgene1_vector_cost =
561 {
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
577 };
578
579 /* Costs for vector insn classes for Vulcan. */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 {
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 3, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
597 };
598
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost =
601 {
602 1, /* Predictable. */
603 3 /* Unpredictable. */
604 };
605
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_NONE /* recip_sqrt */
612 };
613
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes =
616 {
617 AARCH64_APPROX_NONE, /* division */
618 AARCH64_APPROX_ALL, /* sqrt */
619 AARCH64_APPROX_ALL /* recip_sqrt */
620 };
621
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes =
624 {
625 AARCH64_APPROX_NONE, /* division */
626 AARCH64_APPROX_NONE, /* sqrt */
627 AARCH64_APPROX_ALL /* recip_sqrt */
628 };
629
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune =
632 {
633 0, /* num_slots */
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
640 };
641
642 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 {
644 0, /* num_slots */
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
651 };
652
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 {
655 4, /* num_slots */
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
662 };
663
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 {
666 8, /* num_slots */
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
673 };
674
675 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 {
677 8, /* num_slots */
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
684 };
685
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 {
688 8, /* num_slots */
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
695 };
696
697 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 {
699 0, /* num_slots */
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
706 };
707
708 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 {
710 8, /* num_slots */
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
717 };
718
719 static const struct tune_params generic_tunings =
720 {
721 &cortexa57_extra_costs,
722 &generic_addrcost_table,
723 &generic_regmove_cost,
724 &generic_vector_cost,
725 &generic_branch_cost,
726 &generic_approx_modes,
727 SVE_NOT_IMPLEMENTED, /* sve_width */
728 4, /* memmov_cost */
729 2, /* issue_rate */
730 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
742 &generic_prefetch_tune
743 };
744
745 static const struct tune_params cortexa35_tunings =
746 {
747 &cortexa53_extra_costs,
748 &generic_addrcost_table,
749 &cortexa53_regmove_cost,
750 &generic_vector_cost,
751 &generic_branch_cost,
752 &generic_approx_modes,
753 SVE_NOT_IMPLEMENTED, /* sve_width */
754 4, /* memmov_cost */
755 1, /* issue_rate */
756 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
769 &generic_prefetch_tune
770 };
771
772 static const struct tune_params cortexa53_tunings =
773 {
774 &cortexa53_extra_costs,
775 &generic_addrcost_table,
776 &cortexa53_regmove_cost,
777 &generic_vector_cost,
778 &generic_branch_cost,
779 &generic_approx_modes,
780 SVE_NOT_IMPLEMENTED, /* sve_width */
781 4, /* memmov_cost */
782 2, /* issue_rate */
783 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params cortexa57_tunings =
800 {
801 &cortexa57_extra_costs,
802 &generic_addrcost_table,
803 &cortexa57_regmove_cost,
804 &cortexa57_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 SVE_NOT_IMPLEMENTED, /* sve_width */
808 4, /* memmov_cost */
809 3, /* issue_rate */
810 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
823 &generic_prefetch_tune
824 };
825
826 static const struct tune_params cortexa72_tunings =
827 {
828 &cortexa57_extra_costs,
829 &generic_addrcost_table,
830 &cortexa57_regmove_cost,
831 &cortexa57_vector_cost,
832 &generic_branch_cost,
833 &generic_approx_modes,
834 SVE_NOT_IMPLEMENTED, /* sve_width */
835 4, /* memmov_cost */
836 3, /* issue_rate */
837 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
851 };
852
853 static const struct tune_params cortexa73_tunings =
854 {
855 &cortexa57_extra_costs,
856 &generic_addrcost_table,
857 &cortexa57_regmove_cost,
858 &cortexa57_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 SVE_NOT_IMPLEMENTED, /* sve_width */
862 4, /* memmov_cost. */
863 2, /* issue_rate. */
864 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &generic_prefetch_tune
878 };
879
880
881
882 static const struct tune_params exynosm1_tunings =
883 {
884 &exynosm1_extra_costs,
885 &exynosm1_addrcost_table,
886 &exynosm1_regmove_cost,
887 &exynosm1_vector_cost,
888 &generic_branch_cost,
889 &exynosm1_approx_modes,
890 SVE_NOT_IMPLEMENTED, /* sve_width */
891 4, /* memmov_cost */
892 3, /* issue_rate */
893 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
905 &exynosm1_prefetch_tune
906 };
907
908 static const struct tune_params thunderxt88_tunings =
909 {
910 &thunderx_extra_costs,
911 &generic_addrcost_table,
912 &thunderx_regmove_cost,
913 &thunderx_vector_cost,
914 &generic_branch_cost,
915 &generic_approx_modes,
916 SVE_NOT_IMPLEMENTED, /* sve_width */
917 6, /* memmov_cost */
918 2, /* issue_rate */
919 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
931 &thunderxt88_prefetch_tune
932 };
933
934 static const struct tune_params thunderx_tunings =
935 {
936 &thunderx_extra_costs,
937 &generic_addrcost_table,
938 &thunderx_regmove_cost,
939 &thunderx_vector_cost,
940 &generic_branch_cost,
941 &generic_approx_modes,
942 SVE_NOT_IMPLEMENTED, /* sve_width */
943 6, /* memmov_cost */
944 2, /* issue_rate */
945 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
958 &thunderx_prefetch_tune
959 };
960
961 static const struct tune_params tsv110_tunings =
962 {
963 &tsv110_extra_costs,
964 &tsv110_addrcost_table,
965 &tsv110_regmove_cost,
966 &tsv110_vector_cost,
967 &generic_branch_cost,
968 &generic_approx_modes,
969 SVE_NOT_IMPLEMENTED, /* sve_width */
970 4, /* memmov_cost */
971 4, /* issue_rate */
972 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
985 &tsv110_prefetch_tune
986 };
987
988 static const struct tune_params xgene1_tunings =
989 {
990 &xgene1_extra_costs,
991 &xgene1_addrcost_table,
992 &xgene1_regmove_cost,
993 &xgene1_vector_cost,
994 &generic_branch_cost,
995 &xgene1_approx_modes,
996 SVE_NOT_IMPLEMENTED, /* sve_width */
997 6, /* memmov_cost */
998 4, /* issue_rate */
999 AARCH64_FUSE_NOTHING, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1011 &xgene1_prefetch_tune
1012 };
1013
1014 static const struct tune_params emag_tunings =
1015 {
1016 &xgene1_extra_costs,
1017 &xgene1_addrcost_table,
1018 &xgene1_regmove_cost,
1019 &xgene1_vector_cost,
1020 &generic_branch_cost,
1021 &xgene1_approx_modes,
1022 SVE_NOT_IMPLEMENTED,
1023 6, /* memmov_cost */
1024 4, /* issue_rate */
1025 AARCH64_FUSE_NOTHING, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1037 &xgene1_prefetch_tune
1038 };
1039
1040 static const struct tune_params qdf24xx_tunings =
1041 {
1042 &qdf24xx_extra_costs,
1043 &qdf24xx_addrcost_table,
1044 &qdf24xx_regmove_cost,
1045 &qdf24xx_vector_cost,
1046 &generic_branch_cost,
1047 &generic_approx_modes,
1048 SVE_NOT_IMPLEMENTED, /* sve_width */
1049 4, /* memmov_cost */
1050 4, /* issue_rate */
1051 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1065 };
1066
1067 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 for now. */
1069 static const struct tune_params saphira_tunings =
1070 {
1071 &generic_extra_costs,
1072 &generic_addrcost_table,
1073 &generic_regmove_cost,
1074 &generic_vector_cost,
1075 &generic_branch_cost,
1076 &generic_approx_modes,
1077 SVE_NOT_IMPLEMENTED, /* sve_width */
1078 4, /* memmov_cost */
1079 4, /* issue_rate */
1080 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1093 &generic_prefetch_tune
1094 };
1095
1096 static const struct tune_params thunderx2t99_tunings =
1097 {
1098 &thunderx2t99_extra_costs,
1099 &thunderx2t99_addrcost_table,
1100 &thunderx2t99_regmove_cost,
1101 &thunderx2t99_vector_cost,
1102 &generic_branch_cost,
1103 &generic_approx_modes,
1104 SVE_NOT_IMPLEMENTED, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1121 };
1122
1123 static const struct tune_params neoversen1_tunings =
1124 {
1125 &cortexa57_extra_costs,
1126 &generic_addrcost_table,
1127 &generic_regmove_cost,
1128 &cortexa57_vector_cost,
1129 &generic_branch_cost,
1130 &generic_approx_modes,
1131 SVE_NOT_IMPLEMENTED, /* sve_width */
1132 4, /* memmov_cost */
1133 3, /* issue_rate */
1134 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1146 &generic_prefetch_tune
1147 };
1148
1149 /* Support for fine-grained override of the tuning structures. */
1150 struct aarch64_tuning_override_function
1151 {
1152 const char* name;
1153 void (*parse_override)(const char*, struct tune_params*);
1154 };
1155
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions[] =
1162 {
1163 { "fuse", aarch64_parse_fuse_string },
1164 { "tune", aarch64_parse_tune_string },
1165 { "sve_width", aarch64_parse_sve_width_string },
1166 { NULL, NULL }
1167 };
1168
1169 /* A processor implementing AArch64. */
1170 struct processor
1171 {
1172 const char *const name;
1173 enum aarch64_processor ident;
1174 enum aarch64_processor sched_core;
1175 enum aarch64_arch arch;
1176 unsigned architecture_version;
1177 const uint64_t flags;
1178 const struct tune_params *const tune;
1179 };
1180
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures[] =
1183 {
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 };
1189
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores[] =
1192 {
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1199 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1200 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1201 };
1202
1203
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor *selected_arch;
1207 static const struct processor *selected_cpu;
1208 static const struct processor *selected_tune;
1209
1210 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params = generic_tunings;
1214
1215 /* Table of machine attributes. */
1216 static const struct attribute_spec aarch64_attribute_table[] =
1217 {
1218 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1219 affects_type_identity, handler, exclude } */
1220 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1221 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1222 };
1223
1224 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1225
1226 /* An ISA extension in the co-processor and main instruction set space. */
1227 struct aarch64_option_extension
1228 {
1229 const char *const name;
1230 const unsigned long flags_on;
1231 const unsigned long flags_off;
1232 };
1233
1234 typedef enum aarch64_cond_code
1235 {
1236 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1237 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1238 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1239 }
1240 aarch64_cc;
1241
1242 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
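/* This relies on the layout of aarch64_cond_code above: each condition sits
   next to its inverse (EQ/NE, CS/CC, ..., GT/LE), so flipping the low bit
   inverts the condition.  AL and NV are not genuine conditions and are not
   expected to be passed to this macro.  */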
1243
1244 struct aarch64_branch_protect_type
1245 {
1246 /* The type's name that the user passes to the branch-protection option
1247 string. */
1248 const char* name;
1249 /* Function to handle the protection type and set global variables.
1250 First argument is the string token corresponding to this type and the
1251 second argument is the next token in the option string.
1252 Return values:
1253 * AARCH64_PARSE_OK: Handling was successful.
1254 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1255 caller should print an error.
1256 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1257 prints its own error. */
1258 enum aarch64_parse_opt_result (*handler)(char*, char*);
1259 /* A list of types that can follow this type in the option string. */
1260 const aarch64_branch_protect_type* subtypes;
1261 unsigned int num_subtypes;
1262 };
1263
1264 static enum aarch64_parse_opt_result
1265 aarch64_handle_no_branch_protection (char* str, char* rest)
1266 {
1267 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1268 aarch64_enable_bti = 0;
1269 if (rest)
1270 {
1271 error ("unexpected %<%s%> after %<%s%>", rest, str);
1272 return AARCH64_PARSE_INVALID_FEATURE;
1273 }
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static enum aarch64_parse_opt_result
1278 aarch64_handle_standard_branch_protection (char* str, char* rest)
1279 {
1280 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1281 aarch64_ra_sign_key = AARCH64_KEY_A;
1282 aarch64_enable_bti = 1;
1283 if (rest)
1284 {
1285 error ("unexpected %<%s%> after %<%s%>", rest, str);
1286 return AARCH64_PARSE_INVALID_FEATURE;
1287 }
1288 return AARCH64_PARSE_OK;
1289 }
1290
1291 static enum aarch64_parse_opt_result
1292 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1293 char* rest ATTRIBUTE_UNUSED)
1294 {
1295 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1296 aarch64_ra_sign_key = AARCH64_KEY_A;
1297 return AARCH64_PARSE_OK;
1298 }
1299
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1302 char* rest ATTRIBUTE_UNUSED)
1303 {
1304 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1305 return AARCH64_PARSE_OK;
1306 }
1307
1308 static enum aarch64_parse_opt_result
1309 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1310 char* rest ATTRIBUTE_UNUSED)
1311 {
1312 aarch64_ra_sign_key = AARCH64_KEY_B;
1313 return AARCH64_PARSE_OK;
1314 }
1315
1316 static enum aarch64_parse_opt_result
1317 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1318 char* rest ATTRIBUTE_UNUSED)
1319 {
1320 aarch64_enable_bti = 1;
1321 return AARCH64_PARSE_OK;
1322 }
1323
1324 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1325 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1326 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1327 { NULL, NULL, NULL, 0 }
1328 };
1329
1330 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1331 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1332 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1333 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1334 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1335 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1336 { NULL, NULL, NULL, 0 }
1337 };
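/* For example, -mbranch-protection=pac-ret+leaf is handled by looking up
   "pac-ret" in the table above (enabling return-address signing of non-leaf
   functions with the A key) and then "leaf" in its subtype table, which
   widens the signing scope to all functions.  */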
1338
1339 /* The condition codes of the processor, and the inverse function. */
1340 static const char * const aarch64_condition_codes[] =
1341 {
1342 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1343 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 };
1345
1346 /* The preferred condition codes for SVE conditions. */
1347 static const char *const aarch64_sve_condition_codes[] =
1348 {
1349 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1350 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 };
1352
1353 /* Return the assembly token for svpattern value VALUE. */
1354
1355 static const char *
1356 svpattern_token (enum aarch64_svpattern pattern)
1357 {
1358 switch (pattern)
1359 {
1360 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1361 AARCH64_FOR_SVPATTERN (CASE)
1362 #undef CASE
1363 case AARCH64_NUM_SVPATTERNS:
1364 break;
1365 }
1366 gcc_unreachable ();
1367 }
1368
1369 /* Return the descriptor of the SIMD ABI. */
1370
1371 static const predefined_function_abi &
1372 aarch64_simd_abi (void)
1373 {
1374 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1375 if (!simd_abi.initialized_p ())
1376 {
1377 HARD_REG_SET full_reg_clobbers
1378 = default_function_abi.full_reg_clobbers ();
1379 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1380 if (FP_SIMD_SAVED_REGNUM_P (regno))
1381 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1382 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1383 }
1384 return simd_abi;
1385 }
1386
1387 /* Generate code to enable conditional branches in functions over 1 MiB. */
1388 const char *
1389 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1390 const char * branch_format)
1391 {
1392 rtx_code_label * tmp_label = gen_label_rtx ();
1393 char label_buf[256];
1394 char buffer[128];
1395 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1396 CODE_LABEL_NUMBER (tmp_label));
1397 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1398 rtx dest_label = operands[pos_label];
1399 operands[pos_label] = tmp_label;
1400
1401 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1402 output_asm_insn (buffer, operands);
1403
1404 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1405 operands[pos_label] = dest_label;
1406 output_asm_insn (buffer, operands);
1407 return "";
1408 }
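/* The emitted sequence is a short branch around an unconditional branch to
   the distant target:

        <branch_format> .Ltmp    ; condition already inverted by the caller
        b       <original target>
     .Ltmp:

   Callers in aarch64.md are expected to pass BRANCH_FORMAT with the sense of
   the condition inverted (typically via the %M operand modifier).  */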
1409
1410 void
1411 aarch64_err_no_fpadvsimd (machine_mode mode)
1412 {
1413 if (TARGET_GENERAL_REGS_ONLY)
1414 if (FLOAT_MODE_P (mode))
1415 error ("%qs is incompatible with the use of floating-point types",
1416 "-mgeneral-regs-only");
1417 else
1418 error ("%qs is incompatible with the use of vector types",
1419 "-mgeneral-regs-only");
1420 else
1421 if (FLOAT_MODE_P (mode))
1422 error ("%qs feature modifier is incompatible with the use of"
1423 " floating-point types", "+nofp");
1424 else
1425 error ("%qs feature modifier is incompatible with the use of"
1426 " vector types", "+nofp");
1427 }
1428
1429 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1430 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1431 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1432 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1433 and GENERAL_REGS is lower than the memory cost (in this case the best class
1434 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1435 cost results in bad allocations with many redundant int<->FP moves which
1436 are expensive on various cores.
1437 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1438 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1439 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1440 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1441 The result of this is that it is no longer inefficient to have a higher
1442 memory move cost than the register move cost.
1443 */
1444
1445 static reg_class_t
1446 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1447 reg_class_t best_class)
1448 {
1449 machine_mode mode;
1450
1451 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1452 || !reg_class_subset_p (FP_REGS, allocno_class))
1453 return allocno_class;
1454
1455 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1456 || !reg_class_subset_p (FP_REGS, best_class))
1457 return best_class;
1458
1459 mode = PSEUDO_REGNO_MODE (regno);
1460 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1461 }
1462
1463 static unsigned int
1464 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1465 {
1466 if (GET_MODE_UNIT_SIZE (mode) == 4)
1467 return aarch64_tune_params.min_div_recip_mul_sf;
1468 return aarch64_tune_params.min_div_recip_mul_df;
1469 }
1470
1471 /* Return the reassociation width of treeop OPC with mode MODE. */
1472 static int
1473 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1474 {
1475 if (VECTOR_MODE_P (mode))
1476 return aarch64_tune_params.vec_reassoc_width;
1477 if (INTEGRAL_MODE_P (mode))
1478 return aarch64_tune_params.int_reassoc_width;
1479 /* Avoid reassociating floating point addition so we emit more FMAs. */
1480 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1481 return aarch64_tune_params.fp_reassoc_width;
1482 return 1;
1483 }
1484
1485 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1486 unsigned
1487 aarch64_dbx_register_number (unsigned regno)
1488 {
1489 if (GP_REGNUM_P (regno))
1490 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1491 else if (regno == SP_REGNUM)
1492 return AARCH64_DWARF_SP;
1493 else if (FP_REGNUM_P (regno))
1494 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1495 else if (PR_REGNUM_P (regno))
1496 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1497 else if (regno == VG_REGNUM)
1498 return AARCH64_DWARF_VG;
1499
1500 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1501 equivalent DWARF register. */
1502 return DWARF_FRAME_REGISTERS;
1503 }
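/* This follows the AArch64 DWARF register numbering: x0-x30 map to 0-30,
   sp to 31, vg to 46, p0-p15 to 48-63 and v0-v31 to 64-95.  */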
1504
1505 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1506 integer, otherwise return X unmodified. */
1507 static rtx
1508 aarch64_bit_representation (rtx x)
1509 {
1510 if (CONST_DOUBLE_P (x))
1511 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1512 return x;
1513 }
1514
1515 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1516 static bool
1517 aarch64_advsimd_struct_mode_p (machine_mode mode)
1518 {
1519 return (TARGET_SIMD
1520 && (mode == OImode || mode == CImode || mode == XImode));
1521 }
1522
1523 /* Return true if MODE is an SVE predicate mode. */
1524 static bool
1525 aarch64_sve_pred_mode_p (machine_mode mode)
1526 {
1527 return (TARGET_SVE
1528 && (mode == VNx16BImode
1529 || mode == VNx8BImode
1530 || mode == VNx4BImode
1531 || mode == VNx2BImode));
1532 }
1533
1534 /* Three mutually-exclusive flags describing a vector or predicate type. */
1535 const unsigned int VEC_ADVSIMD = 1;
1536 const unsigned int VEC_SVE_DATA = 2;
1537 const unsigned int VEC_SVE_PRED = 4;
1538 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1539 a structure of 2, 3 or 4 vectors. */
1540 const unsigned int VEC_STRUCT = 8;
1541 /* Useful combinations of the above. */
1542 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1543 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1544
1545 /* Return a set of flags describing the vector properties of mode MODE.
1546 Ignore modes that are not supported by the current target. */
1547 static unsigned int
1548 aarch64_classify_vector_mode (machine_mode mode)
1549 {
1550 if (aarch64_advsimd_struct_mode_p (mode))
1551 return VEC_ADVSIMD | VEC_STRUCT;
1552
1553 if (aarch64_sve_pred_mode_p (mode))
1554 return VEC_SVE_PRED;
1555
1556 /* Make the decision based on the mode's enum value rather than its
1557 properties, so that we keep the correct classification regardless
1558 of -msve-vector-bits. */
1559 switch (mode)
1560 {
1561 /* Single SVE vectors. */
1562 case E_VNx16QImode:
1563 case E_VNx8HImode:
1564 case E_VNx4SImode:
1565 case E_VNx2DImode:
1566 case E_VNx8HFmode:
1567 case E_VNx4SFmode:
1568 case E_VNx2DFmode:
1569 return TARGET_SVE ? VEC_SVE_DATA : 0;
1570
1571 /* x2 SVE vectors. */
1572 case E_VNx32QImode:
1573 case E_VNx16HImode:
1574 case E_VNx8SImode:
1575 case E_VNx4DImode:
1576 case E_VNx16HFmode:
1577 case E_VNx8SFmode:
1578 case E_VNx4DFmode:
1579 /* x3 SVE vectors. */
1580 case E_VNx48QImode:
1581 case E_VNx24HImode:
1582 case E_VNx12SImode:
1583 case E_VNx6DImode:
1584 case E_VNx24HFmode:
1585 case E_VNx12SFmode:
1586 case E_VNx6DFmode:
1587 /* x4 SVE vectors. */
1588 case E_VNx64QImode:
1589 case E_VNx32HImode:
1590 case E_VNx16SImode:
1591 case E_VNx8DImode:
1592 case E_VNx32HFmode:
1593 case E_VNx16SFmode:
1594 case E_VNx8DFmode:
1595 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1596
1597 /* 64-bit Advanced SIMD vectors. */
1598 case E_V8QImode:
1599 case E_V4HImode:
1600 case E_V2SImode:
1601 /* ...E_V1DImode doesn't exist. */
1602 case E_V4HFmode:
1603 case E_V2SFmode:
1604 case E_V1DFmode:
1605 /* 128-bit Advanced SIMD vectors. */
1606 case E_V16QImode:
1607 case E_V8HImode:
1608 case E_V4SImode:
1609 case E_V2DImode:
1610 case E_V8HFmode:
1611 case E_V4SFmode:
1612 case E_V2DFmode:
1613 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1614
1615 default:
1616 return 0;
1617 }
1618 }
1619
1620 /* Return true if MODE is any of the data vector modes, including
1621 structure modes. */
1622 static bool
1623 aarch64_vector_data_mode_p (machine_mode mode)
1624 {
1625 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1626 }
1627
1628 /* Return true if MODE is any form of SVE mode, including predicates,
1629 vectors and structures. */
1630 bool
1631 aarch64_sve_mode_p (machine_mode mode)
1632 {
1633 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1634 }
1635
1636 /* Return true if MODE is an SVE data vector mode; either a single vector
1637 or a structure of vectors. */
1638 static bool
1639 aarch64_sve_data_mode_p (machine_mode mode)
1640 {
1641 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1642 }
1643
1644 /* Implement target hook TARGET_ARRAY_MODE. */
1645 static opt_machine_mode
1646 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1647 {
1648 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1649 && IN_RANGE (nelems, 2, 4))
1650 return mode_for_vector (GET_MODE_INNER (mode),
1651 GET_MODE_NUNITS (mode) * nelems);
1652
1653 return opt_machine_mode ();
1654 }
1655
1656 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1657 static bool
1658 aarch64_array_mode_supported_p (machine_mode mode,
1659 unsigned HOST_WIDE_INT nelems)
1660 {
1661 if (TARGET_SIMD
1662 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1663 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1664 && (nelems >= 2 && nelems <= 4))
1665 return true;
1666
1667 return false;
1668 }
1669
1670 /* Return the SVE predicate mode to use for elements that have
1671 ELEM_NBYTES bytes, if such a mode exists. */
1672
1673 opt_machine_mode
1674 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1675 {
1676 if (TARGET_SVE)
1677 {
1678 if (elem_nbytes == 1)
1679 return VNx16BImode;
1680 if (elem_nbytes == 2)
1681 return VNx8BImode;
1682 if (elem_nbytes == 4)
1683 return VNx4BImode;
1684 if (elem_nbytes == 8)
1685 return VNx2BImode;
1686 }
1687 return opt_machine_mode ();
1688 }
1689
1690 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1691
1692 static opt_machine_mode
1693 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1694 {
1695 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1696 {
1697 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1698 machine_mode pred_mode;
1699 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1700 return pred_mode;
1701 }
1702
1703 return default_get_mask_mode (nunits, nbytes);
1704 }
1705
1706 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1707
1708 static opt_machine_mode
1709 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1710 {
1711 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1712 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1713 machine_mode mode;
1714 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1715 if (inner_mode == GET_MODE_INNER (mode)
1716 && known_eq (nunits, GET_MODE_NUNITS (mode))
1717 && aarch64_sve_data_mode_p (mode))
1718 return mode;
1719 return opt_machine_mode ();
1720 }
1721
1722 /* Return the integer element mode associated with SVE mode MODE. */
1723
1724 static scalar_int_mode
1725 aarch64_sve_element_int_mode (machine_mode mode)
1726 {
1727 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1728 GET_MODE_NUNITS (mode));
1729 return int_mode_for_size (elt_bits, 0).require ();
1730 }
1731
1732 /* Return the integer vector mode associated with SVE mode MODE.
1733 Unlike mode_for_int_vector, this can handle the case in which
1734 MODE is a predicate (and thus has a different total size). */
1735
1736 static machine_mode
1737 aarch64_sve_int_mode (machine_mode mode)
1738 {
1739 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1740 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1741 }
1742
1743 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1744 prefer to use the first arithmetic operand as the else value if
1745 the else value doesn't matter, since that exactly matches the SVE
1746 destructive merging form. For ternary operations we could either
1747 pick the first operand and use FMAD-like instructions or the last
1748 operand and use FMLA-like instructions; the latter seems more
1749 natural. */
1750
1751 static tree
1752 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1753 {
1754 return nops == 3 ? ops[2] : ops[0];
1755 }
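/* For example, for a predicated fused multiply-add a * b + c, returning the
   accumulator C as the else value means a single merging FMLA gives the
   result in the active lanes and leaves C in the inactive ones, with no
   separate select needed.  */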
1756
1757 /* Implement TARGET_HARD_REGNO_NREGS. */
1758
1759 static unsigned int
1760 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1761 {
1762 /* ??? Logically we should only need to provide a value when
1763 HARD_REGNO_MODE_OK says that the combination is valid,
1764 but at the moment we need to handle all modes. Just ignore
1765 any runtime parts for registers that can't store them. */
1766 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1767 switch (aarch64_regno_regclass (regno))
1768 {
1769 case FP_REGS:
1770 case FP_LO_REGS:
1771 case FP_LO8_REGS:
1772 if (aarch64_sve_data_mode_p (mode))
1773 return exact_div (GET_MODE_SIZE (mode),
1774 BYTES_PER_SVE_VECTOR).to_constant ();
1775 return CEIL (lowest_size, UNITS_PER_VREG);
1776 case PR_REGS:
1777 case PR_LO_REGS:
1778 case PR_HI_REGS:
1779 return 1;
1780 default:
1781 return CEIL (lowest_size, UNITS_PER_WORD);
1782 }
1783 gcc_unreachable ();
1784 }
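/* For example, TImode occupies two general registers (CEIL (16, 8)), a
   128-bit Advanced SIMD vector occupies one FP register, a single SVE data
   vector always counts as one FP register regardless of -msve-vector-bits,
   and any predicate mode occupies one predicate register.  */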
1785
1786 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1787
1788 static bool
1789 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1790 {
1791 if (GET_MODE_CLASS (mode) == MODE_CC)
1792 return regno == CC_REGNUM;
1793
1794 if (regno == VG_REGNUM)
1795 /* This must have the same size as _Unwind_Word. */
1796 return mode == DImode;
1797
1798 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1799 if (vec_flags & VEC_SVE_PRED)
1800 return PR_REGNUM_P (regno);
1801
1802 if (PR_REGNUM_P (regno))
1803 return 0;
1804
1805 if (regno == SP_REGNUM)
1806 /* The purpose of comparing with ptr_mode is to support the
1807 global register variable associated with the stack pointer
1808 register via the syntax of asm ("wsp") in ILP32. */
1809 return mode == Pmode || mode == ptr_mode;
1810
1811 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1812 return mode == Pmode;
1813
1814 if (GP_REGNUM_P (regno))
1815 {
1816 if (known_le (GET_MODE_SIZE (mode), 8))
1817 return true;
1818 else if (known_le (GET_MODE_SIZE (mode), 16))
1819 return (regno & 1) == 0;
1820 }
1821 else if (FP_REGNUM_P (regno))
1822 {
1823 if (vec_flags & VEC_STRUCT)
1824 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1825 else
1826 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1827 }
1828
1829 return false;
1830 }
1831
1832 /* Implement TARGET_FNTYPE_ABI. */
1833
1834 static const predefined_function_abi &
1835 aarch64_fntype_abi (const_tree fntype)
1836 {
1837 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
1838 return aarch64_simd_abi ();
1839 return default_function_abi;
1840 }
1841
1842 /* Return true if this is a definition of a vectorized simd function. */
1843
1844 static bool
1845 aarch64_simd_decl_p (tree fndecl)
1846 {
1847 tree fntype;
1848
1849 if (fndecl == NULL)
1850 return false;
1851 fntype = TREE_TYPE (fndecl);
1852 if (fntype == NULL)
1853 return false;
1854
1855 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1856 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1857 return true;
1858
1859 return false;
1860 }
1861
1862 /* Return the mode a register save/restore should use. DImode for integer
1863 registers, DFmode for FP registers in non-SIMD functions (they only save
1864 the bottom half of a 128-bit register), or TFmode for FP registers in
1865 SIMD functions. */
1866
1867 static machine_mode
1868 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1869 {
1870 return GP_REGNUM_P (regno)
1871 ? E_DImode
1872 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1873 }
1874
1875 /* Return true if the instruction is a call to a SIMD function, false
1876 if it is not a SIMD function or if we do not know anything about
1877 the function. */
1878
1879 static bool
1880 aarch64_simd_call_p (const rtx_insn *insn)
1881 {
1882 rtx symbol;
1883 rtx call;
1884 tree fndecl;
1885
1886 gcc_assert (CALL_P (insn));
1887 call = get_call_rtx_from (insn);
1888 symbol = XEXP (XEXP (call, 0), 0);
1889 if (GET_CODE (symbol) != SYMBOL_REF)
1890 return false;
1891 fndecl = SYMBOL_REF_DECL (symbol);
1892 if (!fndecl)
1893 return false;
1894
1895 return aarch64_simd_decl_p (fndecl);
1896 }
1897
1898 /* Implement TARGET_INSN_CALLEE_ABI. */
1899
1900 const predefined_function_abi &
1901 aarch64_insn_callee_abi (const rtx_insn *insn)
1902 {
1903 if (aarch64_simd_call_p (insn))
1904 return aarch64_simd_abi ();
1905 return default_function_abi;
1906 }
1907
1908 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1909 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1910 clobbers the top 64 bits when restoring the bottom 64 bits. */
1911
1912 static bool
1913 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
1914 unsigned int regno,
1915 machine_mode mode)
1916 {
1917 if (FP_REGNUM_P (regno))
1918 {
1919 bool simd_p = (abi_id == ARM_PCS_SIMD);
1920 poly_int64 per_register_size = GET_MODE_SIZE (mode);
1921 unsigned int nregs = hard_regno_nregs (regno, mode);
1922 if (nregs > 1)
1923 per_register_size = exact_div (per_register_size, nregs);
1924 return maybe_gt (per_register_size, simd_p ? 16 : 8);
1925 }
1926 return false;
1927 }
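/* For example, a V16QImode value (16 bytes) held in an FP register is
   part-clobbered by a call to a base-PCS function, since only the low
   8 bytes are preserved, but not by a call to an aarch64_vector_pcs
   function, which preserves all 16 bytes.  A DFmode value is never
   part-clobbered, because it fits entirely in the preserved part.  */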
1928
1929 /* Implement REGMODE_NATURAL_SIZE. */
1930 poly_uint64
1931 aarch64_regmode_natural_size (machine_mode mode)
1932 {
1933 /* The natural size for SVE data modes is one SVE data vector,
1934 and similarly for predicates. We can't independently modify
1935 anything smaller than that. */
1936 /* ??? For now, only do this for variable-width SVE registers.
1937 Doing it for constant-sized registers breaks lower-subreg.c. */
1938 /* ??? And once that's fixed, we should probably have similar
1939 code for Advanced SIMD. */
1940 if (!aarch64_sve_vg.is_constant ())
1941 {
1942 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1943 if (vec_flags & VEC_SVE_PRED)
1944 return BYTES_PER_SVE_PRED;
1945 if (vec_flags & VEC_SVE_DATA)
1946 return BYTES_PER_SVE_VECTOR;
1947 }
1948 return UNITS_PER_WORD;
1949 }
1950
1951 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1952 machine_mode
1953 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1954 machine_mode mode)
1955 {
1956 /* The predicate mode determines which bits are significant and
1957 which are "don't care". Decreasing the number of lanes would
1958 lose data while increasing the number of lanes would make bits
1959 unnecessarily significant. */
1960 if (PR_REGNUM_P (regno))
1961 return mode;
1962 if (known_ge (GET_MODE_SIZE (mode), 4))
1963 return mode;
1964 else
1965 return SImode;
1966 }
1967
1968 /* Return true if I's bits are consecutive ones from the MSB. */
1969 bool
1970 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1971 {
1972 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1973 }
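/* For example, 0xffffffffffff0000 gives -i == 0x10000, whose exact_log2
   is 16, so the function returns true; 0x00ffff00 is rejected because
   its negation is not a power of two.  */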
1974
1975 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1976 that strcpy from constants will be faster. */
1977
1978 static HOST_WIDE_INT
1979 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1980 {
1981 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1982 return MAX (align, BITS_PER_WORD);
1983 return align;
1984 }
1985
1986 /* Return true if calls to DECL should be treated as
1987 long-calls (i.e. called via a register). */
1988 static bool
1989 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1990 {
1991 return false;
1992 }
1993
1994 /* Return true if calls to symbol-ref SYM should be treated as
1995 long-calls (i.e. called via a register). */
1996 bool
1997 aarch64_is_long_call_p (rtx sym)
1998 {
1999 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2000 }
2001
2002 /* Return true if calls to symbol-ref SYM should not go through
2003 plt stubs. */
2004
2005 bool
2006 aarch64_is_noplt_call_p (rtx sym)
2007 {
2008 const_tree decl = SYMBOL_REF_DECL (sym);
2009
2010 if (flag_pic
2011 && decl
2012 && (!flag_plt
2013 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2014 && !targetm.binds_local_p (decl))
2015 return true;
2016
2017 return false;
2018 }
2019
2020 /* Return true if the offsets to a zero/sign-extract operation
2021 represent an expression that matches an extend operation. The
2022 operands represent the parameters from
2023
2024 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2025 bool
2026 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2027 rtx extract_imm)
2028 {
2029 HOST_WIDE_INT mult_val, extract_val;
2030
2031 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2032 return false;
2033
2034 mult_val = INTVAL (mult_imm);
2035 extract_val = INTVAL (extract_imm);
2036
2037 if (extract_val > 8
2038 && extract_val < GET_MODE_BITSIZE (mode)
2039 && exact_log2 (extract_val & ~7) > 0
2040 && (extract_val & 7) <= 4
2041 && mult_val == (1 << (extract_val & 7)))
2042 return true;
2043
2044 return false;
2045 }
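/* For example, MULT_IMM == 8 and EXTRACT_IMM == 35 satisfy the check:
   extracting bits [34:0] of (reg * 8) is the same as zero- or
   sign-extending the low 32 bits of REG and shifting the result left
   by 3, which matches extended-register forms such as "uxtw #3".  */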
2046
2047 /* Emit an insn that's a simple single-set. Both the operands must be
2048 known to be valid. */
2049 inline static rtx_insn *
2050 emit_set_insn (rtx x, rtx y)
2051 {
2052 return emit_insn (gen_rtx_SET (x, y));
2053 }
2054
2055 /* X and Y are two things to compare using CODE. Emit the compare insn and
2056 return the rtx for register 0 in the proper mode. */
2057 rtx
2058 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2059 {
2060 machine_mode cmp_mode = GET_MODE (x);
2061 machine_mode cc_mode;
2062 rtx cc_reg;
2063
2064 if (cmp_mode == TImode)
2065 {
2066 gcc_assert (code == NE);
2067
2068 cc_mode = CCmode;
2069 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2070
2071 rtx x_lo = operand_subword (x, 0, 0, TImode);
2072 rtx y_lo = operand_subword (y, 0, 0, TImode);
2073 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2074
2075 rtx x_hi = operand_subword (x, 1, 0, TImode);
2076 rtx y_hi = operand_subword (y, 1, 0, TImode);
2077 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2078 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2079 GEN_INT (AARCH64_EQ)));
2080 }
2081 else
2082 {
2083 cc_mode = SELECT_CC_MODE (code, x, y);
2084 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2085 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2086 }
2087 return cc_reg;
2088 }
2089
2090 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2091
2092 static rtx
2093 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2094 machine_mode y_mode)
2095 {
2096 if (y_mode == E_QImode || y_mode == E_HImode)
2097 {
2098 if (CONST_INT_P (y))
2099 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2100 else
2101 {
2102 rtx t, cc_reg;
2103 machine_mode cc_mode;
2104
2105 t = gen_rtx_ZERO_EXTEND (SImode, y);
2106 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2107 cc_mode = CC_SWPmode;
2108 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2109 emit_set_insn (cc_reg, t);
2110 return cc_reg;
2111 }
2112 }
2113
2114 if (!aarch64_plus_operand (y, y_mode))
2115 y = force_reg (y_mode, y);
2116
2117 return aarch64_gen_compare_reg (code, x, y);
2118 }
2119
2120 /* Build the SYMBOL_REF for __tls_get_addr. */
2121
2122 static GTY(()) rtx tls_get_addr_libfunc;
2123
2124 rtx
2125 aarch64_tls_get_addr (void)
2126 {
2127 if (!tls_get_addr_libfunc)
2128 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2129 return tls_get_addr_libfunc;
2130 }
2131
2132 /* Return the TLS model to use for ADDR. */
2133
2134 static enum tls_model
2135 tls_symbolic_operand_type (rtx addr)
2136 {
2137 enum tls_model tls_kind = TLS_MODEL_NONE;
2138 if (GET_CODE (addr) == CONST)
2139 {
2140 poly_int64 addend;
2141 rtx sym = strip_offset (addr, &addend);
2142 if (GET_CODE (sym) == SYMBOL_REF)
2143 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2144 }
2145 else if (GET_CODE (addr) == SYMBOL_REF)
2146 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2147
2148 return tls_kind;
2149 }
2150
2151 /* We'll allow lo_sums in our legitimate addresses, so that combine
2152 can take care of combining addresses where necessary, but for
2153 generation purposes we'll generate the address
2154 as:
2155 RTL Absolute
2156 tmp = hi (symbol_ref); adrp x1, foo
2157 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2158 nop
2159
2160 PIC TLS
2161 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2162 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2163 bl __tls_get_addr
2164 nop
2165
2166 Load TLS symbol, depending on TLS mechanism and TLS access model.
2167
2168 Global Dynamic - Traditional TLS:
2169 adrp tmp, :tlsgd:imm
2170 add dest, tmp, #:tlsgd_lo12:imm
2171 bl __tls_get_addr
2172
2173 Global Dynamic - TLS Descriptors:
2174 adrp dest, :tlsdesc:imm
2175 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2176 add dest, dest, #:tlsdesc_lo12:imm
2177 blr tmp
2178 mrs tp, tpidr_el0
2179 add dest, dest, tp
2180
2181 Initial Exec:
2182 mrs tp, tpidr_el0
2183 adrp tmp, :gottprel:imm
2184 ldr dest, [tmp, #:gottprel_lo12:imm]
2185 add dest, dest, tp
2186
2187 Local Exec:
2188 mrs tp, tpidr_el0
2189 add t0, tp, #:tprel_hi12:imm, lsl #12
2190 add t0, t0, #:tprel_lo12_nc:imm
2191 */
2192
2193 static void
2194 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2195 enum aarch64_symbol_type type)
2196 {
2197 switch (type)
2198 {
2199 case SYMBOL_SMALL_ABSOLUTE:
2200 {
2201 /* In ILP32, the mode of dest can be either SImode or DImode. */
2202 rtx tmp_reg = dest;
2203 machine_mode mode = GET_MODE (dest);
2204
2205 gcc_assert (mode == Pmode || mode == ptr_mode);
2206
2207 if (can_create_pseudo_p ())
2208 tmp_reg = gen_reg_rtx (mode);
2209
2210 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2211 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2212 return;
2213 }
2214
2215 case SYMBOL_TINY_ABSOLUTE:
2216 emit_insn (gen_rtx_SET (dest, imm));
2217 return;
2218
2219 case SYMBOL_SMALL_GOT_28K:
2220 {
2221 machine_mode mode = GET_MODE (dest);
2222 rtx gp_rtx = pic_offset_table_rtx;
2223 rtx insn;
2224 rtx mem;
2225
2226 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2227 here before RTL expansion.  Tree IVOPTS will generate RTL patterns
2228 to decide rtx costs, in which case pic_offset_table_rtx is not
2229 initialized.  In that case there is no need to generate the first
2230 adrp instruction, as the final cost for global variable access is
2231 one instruction. */
2232 if (gp_rtx != NULL)
2233 {
2234 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2235 we use the page base as the GOT base, the first page may be
2236 wasted; in the worst case only 28K of space is left for the GOT).
2237 
2238 The generated instruction sequence for accessing a global
2239 variable is:
2240 
2241 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2242 
2243 Only one instruction is needed.  But we must initialize
2244 pic_offset_table_rtx properly.  We generate an initialization insn
2245 for every global access, and let CSE remove all the redundant
2246 copies.
2247 
2248 The final sequence for multiple global variable accesses will look like:
2249
2250 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2251
2252 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2253 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2254 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2255 ... */
2256
2257 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2258 crtl->uses_pic_offset_table = 1;
2259 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2260
2261 if (mode != GET_MODE (gp_rtx))
2262 gp_rtx = gen_lowpart (mode, gp_rtx);
2263
2264 }
2265
2266 if (mode == ptr_mode)
2267 {
2268 if (mode == DImode)
2269 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2270 else
2271 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2272
2273 mem = XVECEXP (SET_SRC (insn), 0, 0);
2274 }
2275 else
2276 {
2277 gcc_assert (mode == Pmode);
2278
2279 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2280 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2281 }
2282
2283 /* The operand is expected to be a MEM.  Whenever the related insn
2284 pattern changes, the code above which calculates MEM should be
2285 updated. */
2286 gcc_assert (GET_CODE (mem) == MEM);
2287 MEM_READONLY_P (mem) = 1;
2288 MEM_NOTRAP_P (mem) = 1;
2289 emit_insn (insn);
2290 return;
2291 }
2292
2293 case SYMBOL_SMALL_GOT_4G:
2294 {
2295 /* In ILP32, the mode of dest can be either SImode or DImode,
2296 while the got entry is always of SImode size. The mode of
2297 dest depends on how dest is used: if dest is assigned to a
2298 pointer (e.g. stored in memory), it has SImode; it may have
2299 DImode if dest is dereferenced to access the memory.
2300 This is why we have to handle three different ldr_got_small
2301 patterns here (two patterns for ILP32). */
2302
2303 rtx insn;
2304 rtx mem;
2305 rtx tmp_reg = dest;
2306 machine_mode mode = GET_MODE (dest);
2307
2308 if (can_create_pseudo_p ())
2309 tmp_reg = gen_reg_rtx (mode);
2310
2311 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2312 if (mode == ptr_mode)
2313 {
2314 if (mode == DImode)
2315 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2316 else
2317 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2318
2319 mem = XVECEXP (SET_SRC (insn), 0, 0);
2320 }
2321 else
2322 {
2323 gcc_assert (mode == Pmode);
2324
2325 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2326 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2327 }
2328
2329 gcc_assert (GET_CODE (mem) == MEM);
2330 MEM_READONLY_P (mem) = 1;
2331 MEM_NOTRAP_P (mem) = 1;
2332 emit_insn (insn);
2333 return;
2334 }
2335
2336 case SYMBOL_SMALL_TLSGD:
2337 {
2338 rtx_insn *insns;
2339 machine_mode mode = GET_MODE (dest);
2340 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2341
2342 start_sequence ();
2343 if (TARGET_ILP32)
2344 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2345 else
2346 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2347 insns = get_insns ();
2348 end_sequence ();
2349
2350 RTL_CONST_CALL_P (insns) = 1;
2351 emit_libcall_block (insns, dest, result, imm);
2352 return;
2353 }
2354
2355 case SYMBOL_SMALL_TLSDESC:
2356 {
2357 machine_mode mode = GET_MODE (dest);
2358 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2359 rtx tp;
2360
2361 gcc_assert (mode == Pmode || mode == ptr_mode);
2362
2363 /* In ILP32, the got entry is always of SImode size. Unlike
2364 small GOT, the dest is fixed at reg 0. */
2365 if (TARGET_ILP32)
2366 emit_insn (gen_tlsdesc_small_si (imm));
2367 else
2368 emit_insn (gen_tlsdesc_small_di (imm));
2369 tp = aarch64_load_tp (NULL);
2370
2371 if (mode != Pmode)
2372 tp = gen_lowpart (mode, tp);
2373
2374 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2375 if (REG_P (dest))
2376 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2377 return;
2378 }
2379
2380 case SYMBOL_SMALL_TLSIE:
2381 {
2382 /* In ILP32, the mode of dest can be either SImode or DImode,
2383 while the got entry is always of SImode size. The mode of
2384 dest depends on how dest is used: if dest is assigned to a
2385 pointer (e.g. stored in memory), it has SImode; it may have
2386 DImode if dest is dereferenced to access the memory.
2387 This is why we have to handle three different tlsie_small
2388 patterns here (two patterns for ILP32). */
2389 machine_mode mode = GET_MODE (dest);
2390 rtx tmp_reg = gen_reg_rtx (mode);
2391 rtx tp = aarch64_load_tp (NULL);
2392
2393 if (mode == ptr_mode)
2394 {
2395 if (mode == DImode)
2396 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2397 else
2398 {
2399 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2400 tp = gen_lowpart (mode, tp);
2401 }
2402 }
2403 else
2404 {
2405 gcc_assert (mode == Pmode);
2406 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2407 }
2408
2409 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2410 if (REG_P (dest))
2411 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2412 return;
2413 }
2414
2415 case SYMBOL_TLSLE12:
2416 case SYMBOL_TLSLE24:
2417 case SYMBOL_TLSLE32:
2418 case SYMBOL_TLSLE48:
2419 {
2420 machine_mode mode = GET_MODE (dest);
2421 rtx tp = aarch64_load_tp (NULL);
2422
2423 if (mode != Pmode)
2424 tp = gen_lowpart (mode, tp);
2425
2426 switch (type)
2427 {
2428 case SYMBOL_TLSLE12:
2429 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2430 (dest, tp, imm));
2431 break;
2432 case SYMBOL_TLSLE24:
2433 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2434 (dest, tp, imm));
2435 break;
2436 case SYMBOL_TLSLE32:
2437 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2438 (dest, imm));
2439 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2440 (dest, dest, tp));
2441 break;
2442 case SYMBOL_TLSLE48:
2443 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2444 (dest, imm));
2445 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2446 (dest, dest, tp));
2447 break;
2448 default:
2449 gcc_unreachable ();
2450 }
2451
2452 if (REG_P (dest))
2453 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2454 return;
2455 }
2456
2457 case SYMBOL_TINY_GOT:
2458 emit_insn (gen_ldr_got_tiny (dest, imm));
2459 return;
2460
2461 case SYMBOL_TINY_TLSIE:
2462 {
2463 machine_mode mode = GET_MODE (dest);
2464 rtx tp = aarch64_load_tp (NULL);
2465
2466 if (mode == ptr_mode)
2467 {
2468 if (mode == DImode)
2469 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2470 else
2471 {
2472 tp = gen_lowpart (mode, tp);
2473 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2474 }
2475 }
2476 else
2477 {
2478 gcc_assert (mode == Pmode);
2479 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2480 }
2481
2482 if (REG_P (dest))
2483 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2484 return;
2485 }
2486
2487 default:
2488 gcc_unreachable ();
2489 }
2490 }
2491
2492 /* Emit a move from SRC to DEST. Assume that the move expanders can
2493 handle all moves if !can_create_pseudo_p (). The distinction is
2494 important because, unlike emit_move_insn, the move expanders know
2495 how to force Pmode objects into the constant pool even when the
2496 constant pool address is not itself legitimate. */
2497 static rtx
2498 aarch64_emit_move (rtx dest, rtx src)
2499 {
2500 return (can_create_pseudo_p ()
2501 ? emit_move_insn (dest, src)
2502 : emit_move_insn_1 (dest, src));
2503 }
2504
2505 /* Apply UNOPTAB to OP and store the result in DEST. */
2506
2507 static void
2508 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2509 {
2510 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2511 if (dest != tmp)
2512 emit_move_insn (dest, tmp);
2513 }
2514
2515 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2516
2517 static void
2518 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2519 {
2520 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2521 OPTAB_DIRECT);
2522 if (dest != tmp)
2523 emit_move_insn (dest, tmp);
2524 }
2525
2526 /* Split a 128-bit move operation into two 64-bit move operations,
2527 taking care to handle partial overlap of register to register
2528 copies. Special cases are needed when moving between GP regs and
2529 FP regs. SRC can be a register, constant or memory; DST a register
2530 or memory. If either operand is memory it must not have any side
2531 effects. */
2532 void
2533 aarch64_split_128bit_move (rtx dst, rtx src)
2534 {
2535 rtx dst_lo, dst_hi;
2536 rtx src_lo, src_hi;
2537
2538 machine_mode mode = GET_MODE (dst);
2539
2540 gcc_assert (mode == TImode || mode == TFmode);
2541 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2542 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2543
2544 if (REG_P (dst) && REG_P (src))
2545 {
2546 int src_regno = REGNO (src);
2547 int dst_regno = REGNO (dst);
2548
2549 /* Handle FP <-> GP regs. */
2550 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2551 {
2552 src_lo = gen_lowpart (word_mode, src);
2553 src_hi = gen_highpart (word_mode, src);
2554
2555 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2556 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2557 return;
2558 }
2559 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2560 {
2561 dst_lo = gen_lowpart (word_mode, dst);
2562 dst_hi = gen_highpart (word_mode, dst);
2563
2564 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2565 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2566 return;
2567 }
2568 }
2569
2570 dst_lo = gen_lowpart (word_mode, dst);
2571 dst_hi = gen_highpart (word_mode, dst);
2572 src_lo = gen_lowpart (word_mode, src);
2573 src_hi = gen_highpart_mode (word_mode, mode, src);
2574
2575 /* At most one pairing may overlap. */
2576 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2577 {
2578 aarch64_emit_move (dst_hi, src_hi);
2579 aarch64_emit_move (dst_lo, src_lo);
2580 }
2581 else
2582 {
2583 aarch64_emit_move (dst_lo, src_lo);
2584 aarch64_emit_move (dst_hi, src_hi);
2585 }
2586 }
2587
2588 bool
2589 aarch64_split_128bit_move_p (rtx dst, rtx src)
2590 {
2591 return (! REG_P (src)
2592 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2593 }
2594
2595 /* Split a complex SIMD combine. */
2596
2597 void
2598 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2599 {
2600 machine_mode src_mode = GET_MODE (src1);
2601 machine_mode dst_mode = GET_MODE (dst);
2602
2603 gcc_assert (VECTOR_MODE_P (dst_mode));
2604 gcc_assert (register_operand (dst, dst_mode)
2605 && register_operand (src1, src_mode)
2606 && register_operand (src2, src_mode));
2607
2608 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2609 return;
2610 }
2611
2612 /* Split a complex SIMD move. */
2613
2614 void
2615 aarch64_split_simd_move (rtx dst, rtx src)
2616 {
2617 machine_mode src_mode = GET_MODE (src);
2618 machine_mode dst_mode = GET_MODE (dst);
2619
2620 gcc_assert (VECTOR_MODE_P (dst_mode));
2621
2622 if (REG_P (dst) && REG_P (src))
2623 {
2624 gcc_assert (VECTOR_MODE_P (src_mode));
2625 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2626 }
2627 }
2628
2629 bool
2630 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2631 machine_mode ymode, rtx y)
2632 {
2633 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2634 gcc_assert (r != NULL);
2635 return rtx_equal_p (x, r);
2636 }
2637
2638 /* Return TARGET if it is nonnull and a register of mode MODE.
2639 Otherwise, return a fresh register of mode MODE if we can,
2640 or TARGET reinterpreted as MODE if we can't. */
2641
2642 static rtx
2643 aarch64_target_reg (rtx target, machine_mode mode)
2644 {
2645 if (target && REG_P (target) && GET_MODE (target) == mode)
2646 return target;
2647 if (!can_create_pseudo_p ())
2648 {
2649 gcc_assert (target);
2650 return gen_lowpart (mode, target);
2651 }
2652 return gen_reg_rtx (mode);
2653 }
2654
2655 /* Return a register that contains the constant in BUILDER, given that
2656 the constant is a legitimate move operand. Use TARGET as the register
2657 if it is nonnull and convenient. */
2658
2659 static rtx
2660 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2661 {
2662 rtx src = builder.build ();
2663 target = aarch64_target_reg (target, GET_MODE (src));
2664 emit_insn (gen_rtx_SET (target, src));
2665 return target;
2666 }
2667
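/* Return a register of mode MODE that holds VALUE.  Create a new pseudo
   register if we can; otherwise emit a move of VALUE into X (which must
   be nonnull) and return X.  */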
2668 static rtx
2669 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2670 {
2671 if (can_create_pseudo_p ())
2672 return force_reg (mode, value);
2673 else
2674 {
2675 gcc_assert (x);
2676 aarch64_emit_move (x, value);
2677 return x;
2678 }
2679 }
2680
2681 /* Return true if predicate value X is a constant in which every element
2682 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2683 value, i.e. as a predicate in which all bits are significant. */
2684
2685 static bool
2686 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2687 {
2688 if (GET_CODE (x) != CONST_VECTOR)
2689 return false;
2690
2691 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2692 GET_MODE_NUNITS (GET_MODE (x)));
2693 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2694 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2695 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2696
2697 unsigned int nelts = const_vector_encoded_nelts (x);
2698 for (unsigned int i = 0; i < nelts; ++i)
2699 {
2700 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2701 if (!CONST_INT_P (elt))
2702 return false;
2703
2704 builder.quick_push (elt);
2705 for (unsigned int j = 1; j < factor; ++j)
2706 builder.quick_push (const0_rtx);
2707 }
2708 builder.finalize ();
2709 return true;
2710 }
2711
2712 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2713 widest predicate element size it can have (that is, the largest size
2714 for which each element would still be 0 or 1). */
2715
2716 unsigned int
2717 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2718 {
2719 /* Start with the most optimistic assumption: that we only need
2720 one bit per pattern. This is what we will use if only the first
2721 bit in each pattern is ever set. */
2722 unsigned int mask = GET_MODE_SIZE (DImode);
2723 mask |= builder.npatterns ();
2724
2725 /* Look for set bits. */
2726 unsigned int nelts = builder.encoded_nelts ();
2727 for (unsigned int i = 1; i < nelts; ++i)
2728 if (INTVAL (builder.elt (i)) != 0)
2729 {
2730 if (i & 1)
2731 return 1;
2732 mask |= i;
2733 }
2734 return mask & -mask;
2735 }
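/* For example, a constant in which only every eighth bit can be set
   (the VNx16BI view of a .D predicate) typically gives 8, one in which
   only every fourth bit can be set gives 4, and any constant with a set
   bit at an odd index is immediately limited to an element size of 1.  */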
2736
2737 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2738 that the constant would have with predicate element size ELT_SIZE
2739 (ignoring the upper bits in each element) and return:
2740
2741 * -1 if all bits are set
2742 * N if the predicate has N leading set bits followed by all clear bits
2743 * 0 if the predicate does not have any of these forms. */
2744
2745 int
2746 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2747 unsigned int elt_size)
2748 {
2749 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2750 followed by set bits. */
2751 if (builder.nelts_per_pattern () == 3)
2752 return 0;
2753
2754 /* Skip over leading set bits. */
2755 unsigned int nelts = builder.encoded_nelts ();
2756 unsigned int i = 0;
2757 for (; i < nelts; i += elt_size)
2758 if (INTVAL (builder.elt (i)) == 0)
2759 break;
2760 unsigned int vl = i / elt_size;
2761
2762 /* Check for the all-true case. */
2763 if (i == nelts)
2764 return -1;
2765
2766 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2767 repeating pattern of set bits followed by clear bits. */
2768 if (builder.nelts_per_pattern () != 2)
2769 return 0;
2770
2771 /* We have a "foreground" value and a duplicated "background" value.
2772 If the background might repeat and the last set bit belongs to it,
2773 we might have set bits followed by clear bits followed by set bits. */
2774 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2775 return 0;
2776
2777 /* Make sure that the rest are all clear. */
2778 for (; i < nelts; i += elt_size)
2779 if (INTVAL (builder.elt (i)) != 0)
2780 return 0;
2781
2782 return vl;
2783 }
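/* For example, with ELT_SIZE == 2 only every other bit is inspected, so
   the constant { 1, 0, 1, 0, 1, 0, 0, 0, ... } counts as three leading
   set .H elements followed by clear ones and would yield 3, while an
   all-ones constant yields -1 for any ELT_SIZE.  */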
2784
2785 /* See if there is an svpattern that encodes an SVE predicate of mode
2786 PRED_MODE in which the first VL bits are set and the rest are clear.
2787 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2788 A VL of -1 indicates an all-true vector. */
2789
2790 aarch64_svpattern
2791 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2792 {
2793 if (vl < 0)
2794 return AARCH64_SV_ALL;
2795
2796 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2797 return AARCH64_NUM_SVPATTERNS;
2798
2799 if (vl >= 1 && vl <= 8)
2800 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2801
2802 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2803 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2804
2805 int max_vl;
2806 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2807 {
2808 if (vl == (max_vl / 3) * 3)
2809 return AARCH64_SV_MUL3;
2810 /* These would only trigger for non-power-of-2 lengths. */
2811 if (vl == (max_vl & -4))
2812 return AARCH64_SV_MUL4;
2813 if (vl == (1 << floor_log2 (max_vl)))
2814 return AARCH64_SV_POW2;
2815 if (vl == max_vl)
2816 return AARCH64_SV_ALL;
2817 }
2818 return AARCH64_NUM_SVPATTERNS;
2819 }
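/* For example, VL == 3 maps to AARCH64_SV_VL3, VL == 32 maps to
   AARCH64_SV_VL32 and VL == -1 maps to AARCH64_SV_ALL.  With a fixed
   vector length (e.g. -msve-vector-bits=512, giving 64 .B elements),
   VL == 63 maps to AARCH64_SV_MUL3, since 63 == (64 / 3) * 3.  */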
2820
2821 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2822 bits has the lowest bit set and the upper bits clear. This is the
2823 VNx16BImode equivalent of a PTRUE for controlling elements of
2824 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2825 all bits are significant, even the upper zeros. */
2826
2827 rtx
2828 aarch64_ptrue_all (unsigned int elt_size)
2829 {
2830 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2831 builder.quick_push (const1_rtx);
2832 for (unsigned int i = 1; i < elt_size; ++i)
2833 builder.quick_push (const0_rtx);
2834 return builder.build ();
2835 }
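/* For example, aarch64_ptrue_all (2) builds the constant
   { 1, 0, 1, 0, ... }: every .H element is active, and the odd-numbered
   bits, which a .H operation would ignore, are explicitly zero.  */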
2836
2837 /* Return an all-true predicate register of mode MODE. */
2838
2839 rtx
2840 aarch64_ptrue_reg (machine_mode mode)
2841 {
2842 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2843 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2844 return gen_lowpart (mode, reg);
2845 }
2846
2847 /* Return an all-false predicate register of mode MODE. */
2848
2849 rtx
2850 aarch64_pfalse_reg (machine_mode mode)
2851 {
2852 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2853 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2854 return gen_lowpart (mode, reg);
2855 }
2856
2857 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2858 true, or alternatively if we know that the operation predicated by
2859 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2860 aarch64_sve_gp_strictness operand that describes the operation
2861 predicated by PRED1[0]. */
2862
2863 bool
2864 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2865 {
2866 machine_mode mode = GET_MODE (pred2);
2867 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2868 && mode == GET_MODE (pred1[0])
2869 && aarch64_sve_gp_strictness (pred1[1], SImode));
2870 return (pred1[0] == CONSTM1_RTX (mode)
2871 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2872 || rtx_equal_p (pred1[0], pred2));
2873 }
2874
2875 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2876 for it. PRED2[0] is the predicate for the instruction whose result
2877 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2878 for it. Return true if we can prove that the two predicates are
2879 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2880 with PRED1[0] without changing behavior. */
2881
2882 bool
2883 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2884 {
2885 machine_mode mode = GET_MODE (pred1[0]);
2886 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2887 && mode == GET_MODE (pred2[0])
2888 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2889 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2890
2891 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2892 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2893 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2894 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2895 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2896 }
2897
2898 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2899 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2900 Use TARGET as the target register if nonnull and convenient. */
2901
2902 static rtx
2903 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2904 machine_mode data_mode, rtx op1, rtx op2)
2905 {
2906 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2907 expand_operand ops[5];
2908 create_output_operand (&ops[0], target, pred_mode);
2909 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2910 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2911 create_input_operand (&ops[3], op1, data_mode);
2912 create_input_operand (&ops[4], op2, data_mode);
2913 expand_insn (icode, 5, ops);
2914 return ops[0].value;
2915 }
2916
2917 /* Use a comparison to convert integer vector SRC into MODE, which is
2918 the corresponding SVE predicate mode. Use TARGET for the result
2919 if it's nonnull and convenient. */
2920
2921 static rtx
2922 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2923 {
2924 machine_mode src_mode = GET_MODE (src);
2925 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2926 src, CONST0_RTX (src_mode));
2927 }
2928
2929 /* Return true if we can move VALUE into a register using a single
2930 CNT[BHWD] instruction. */
2931
2932 static bool
2933 aarch64_sve_cnt_immediate_p (poly_int64 value)
2934 {
2935 HOST_WIDE_INT factor = value.coeffs[0];
2936 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2937 return (value.coeffs[1] == factor
2938 && IN_RANGE (factor, 2, 16 * 16)
2939 && (factor & 1) == 0
2940 && factor <= 16 * (factor & -factor));
2941 }
2942
2943 /* Likewise for rtx X. */
2944
2945 bool
2946 aarch64_sve_cnt_immediate_p (rtx x)
2947 {
2948 poly_int64 value;
2949 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2950 }
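/* For example, the number of .S elements in a vector is the poly_int64
   (4, 4), which a single CNTW can produce, and three times the vector
   size in bytes is (48, 48), which CNTB with MUL #3 can produce.
   A value whose two coefficients differ cannot come from any CNT
   instruction and is rejected.  */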
2951
2952 /* Return the asm string for an instruction with a CNT-like vector size
2953 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2954 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2955 first part of the operands template (the part that comes before the
2956 vector size itself). PATTERN is the pattern to use. FACTOR is the
2957 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2958 in each quadword. If it is zero, we can use any element size. */
2959
2960 static char *
2961 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2962 aarch64_svpattern pattern,
2963 unsigned int factor,
2964 unsigned int nelts_per_vq)
2965 {
2966 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2967
2968 if (nelts_per_vq == 0)
2969 /* There is some overlap in the ranges of the four CNT instructions.
2970 Here we always use the smallest possible element size, so that the
2971 multiplier is 1 wherever possible. */
2972 nelts_per_vq = factor & -factor;
2973 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2974 gcc_assert (IN_RANGE (shift, 1, 4));
2975 char suffix = "dwhb"[shift - 1];
2976
2977 factor >>= shift;
2978 unsigned int written;
2979 if (pattern == AARCH64_SV_ALL && factor == 1)
2980 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2981 prefix, suffix, operands);
2982 else if (factor == 1)
2983 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2984 prefix, suffix, operands, svpattern_token (pattern));
2985 else
2986 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2987 prefix, suffix, operands, svpattern_token (pattern),
2988 factor);
2989 gcc_assert (written < sizeof (buffer));
2990 return buffer;
2991 }
2992
2993 /* Return the asm string for an instruction with a CNT-like vector size
2994 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2995 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2996 first part of the operands template (the part that comes before the
2997 vector size itself). X is the value of the vector size operand,
2998 as a polynomial integer rtx; we need to convert this into an "all"
2999 pattern with a multiplier. */
3000
3001 char *
3002 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3003 rtx x)
3004 {
3005 poly_int64 value = rtx_to_poly_int64 (x);
3006 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3007 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3008 value.coeffs[1], 0);
3009 }
3010
3011 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3012
3013 bool
3014 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3015 {
3016 poly_int64 value;
3017 return (poly_int_rtx_p (x, &value)
3018 && (aarch64_sve_cnt_immediate_p (value)
3019 || aarch64_sve_cnt_immediate_p (-value)));
3020 }
3021
3022 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3023 operand 0. */
3024
3025 char *
3026 aarch64_output_sve_scalar_inc_dec (rtx offset)
3027 {
3028 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3029 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3030 if (offset_value.coeffs[1] > 0)
3031 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3032 offset_value.coeffs[1], 0);
3033 else
3034 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3035 -offset_value.coeffs[1], 0);
3036 }
3037
3038 /* Return true if we can add VALUE to a register using a single ADDVL
3039 or ADDPL instruction. */
3040
3041 static bool
3042 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3043 {
3044 HOST_WIDE_INT factor = value.coeffs[0];
3045 if (factor == 0 || value.coeffs[1] != factor)
3046 return false;
3047 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3048 and a value of 16 is one vector width. */
3049 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3050 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3051 }
3052
3053 /* Likewise for rtx X. */
3054
3055 bool
3056 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3057 {
3058 poly_int64 value;
3059 return (poly_int_rtx_p (x, &value)
3060 && aarch64_sve_addvl_addpl_immediate_p (value));
3061 }
3062
3063 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3064 to operand 1 and storing the result in operand 0. */
3065
3066 char *
3067 aarch64_output_sve_addvl_addpl (rtx offset)
3068 {
3069 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3070 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3071 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3072
3073 int factor = offset_value.coeffs[1];
3074 if ((factor & 15) == 0)
3075 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3076 else
3077 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3078 return buffer;
3079 }
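/* For example, an offset of three vector lengths is the poly_int64
   (48, 48); its factor of 48 is a multiple of 16, so it prints as
   "addvl %x0, %x1, #3".  An offset of three predicate lengths, (6, 6),
   prints as "addpl %x0, %x1, #3".  */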
3080
3081 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3082 instruction. If it is, store the number of elements in each vector
3083 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3084 factor in *FACTOR_OUT (if nonnull). */
3085
3086 bool
3087 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3088 unsigned int *nelts_per_vq_out)
3089 {
3090 rtx elt;
3091 poly_int64 value;
3092
3093 if (!const_vec_duplicate_p (x, &elt)
3094 || !poly_int_rtx_p (elt, &value))
3095 return false;
3096
3097 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3098 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3099 /* There's no vector INCB. */
3100 return false;
3101
3102 HOST_WIDE_INT factor = value.coeffs[0];
3103 if (value.coeffs[1] != factor)
3104 return false;
3105
3106 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3107 if ((factor % nelts_per_vq) != 0
3108 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3109 return false;
3110
3111 if (factor_out)
3112 *factor_out = factor;
3113 if (nelts_per_vq_out)
3114 *nelts_per_vq_out = nelts_per_vq;
3115 return true;
3116 }
3117
3118 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3119 instruction. */
3120
3121 bool
3122 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3123 {
3124 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3125 }
3126
3127 /* Return the asm template for an SVE vector INC or DEC instruction.
3128 OPERANDS gives the operands before the vector count and X is the
3129 value of the vector count operand itself. */
3130
3131 char *
3132 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3133 {
3134 int factor;
3135 unsigned int nelts_per_vq;
3136 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3137 gcc_unreachable ();
3138 if (factor < 0)
3139 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3140 -factor, nelts_per_vq);
3141 else
3142 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3143 factor, nelts_per_vq);
3144 }
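/* For example, adding a duplicate of (8, 8) to a VNx4SI vector (that is,
   adding twice the number of .S elements to every lane) passes the check
   above with FACTOR == 8 and NELTS_PER_VQ == 4, and prints as
   "incw <operands>, all, mul #2".  */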
3145
3146 static int
3147 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3148 scalar_int_mode mode)
3149 {
3150 int i;
3151 unsigned HOST_WIDE_INT val, val2, mask;
3152 int one_match, zero_match;
3153 int num_insns;
3154
3155 val = INTVAL (imm);
3156
3157 if (aarch64_move_imm (val, mode))
3158 {
3159 if (generate)
3160 emit_insn (gen_rtx_SET (dest, imm));
3161 return 1;
3162 }
3163
3164 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3165 (with XXXX non-zero). In that case check to see if the move can be done in
3166 a smaller mode. */
3167 val2 = val & 0xffffffff;
3168 if (mode == DImode
3169 && aarch64_move_imm (val2, SImode)
3170 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3171 {
3172 if (generate)
3173 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3174
3175 /* Check if we have to emit a second instruction by checking to see
3176 if any of the upper 32 bits of the original DImode value are set. */
3177 if (val == val2)
3178 return 1;
3179
3180 i = (val >> 48) ? 48 : 32;
3181
3182 if (generate)
3183 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3184 GEN_INT ((val >> i) & 0xffff)));
3185
3186 return 2;
3187 }
3188
3189 if ((val >> 32) == 0 || mode == SImode)
3190 {
3191 if (generate)
3192 {
3193 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3194 if (mode == SImode)
3195 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3196 GEN_INT ((val >> 16) & 0xffff)));
3197 else
3198 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3199 GEN_INT ((val >> 16) & 0xffff)));
3200 }
3201 return 2;
3202 }
3203
3204 /* Remaining cases are all for DImode. */
3205
3206 mask = 0xffff;
3207 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3208 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3209 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3210 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3211
3212 if (zero_match != 2 && one_match != 2)
3213 {
3214 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3215 For a 64-bit bitmask try whether changing 16 bits to all ones or
3216 zeroes creates a valid bitmask. To check any repeated bitmask,
3217 try using 16 bits from the other 32-bit half of val. */
3218
3219 for (i = 0; i < 64; i += 16, mask <<= 16)
3220 {
3221 val2 = val & ~mask;
3222 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3223 break;
3224 val2 = val | mask;
3225 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3226 break;
3227 val2 = val2 & ~mask;
3228 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3229 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3230 break;
3231 }
3232 if (i != 64)
3233 {
3234 if (generate)
3235 {
3236 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3237 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3238 GEN_INT ((val >> i) & 0xffff)));
3239 }
3240 return 2;
3241 }
3242 }
3243
3244 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3245 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3246 otherwise skip zero bits. */
3247
3248 num_insns = 1;
3249 mask = 0xffff;
3250 val2 = one_match > zero_match ? ~val : val;
3251 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3252
3253 if (generate)
3254 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3255 ? (val | ~(mask << i))
3256 : (val & (mask << i)))));
3257 for (i += 16; i < 64; i += 16)
3258 {
3259 if ((val2 & (mask << i)) == 0)
3260 continue;
3261 if (generate)
3262 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3263 GEN_INT ((val >> i) & 0xffff)));
3264 num_insns ++;
3265 }
3266
3267 return num_insns;
3268 }
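/* For example, the DImode constant 0x1234567800000000 is neither a
   16-bit immediate nor a bitmask immediate, and two of its 16-bit
   halves are zero, so the code above emits a MOVZ of 0x5678 into bits
   [47:32] followed by a MOVK of 0x1234 into bits [63:48], and returns 2.  */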
3269
3270 /* Return whether imm is a 128-bit immediate which is simple enough to
3271 expand inline. */
3272 bool
3273 aarch64_mov128_immediate (rtx imm)
3274 {
3275 if (GET_CODE (imm) == CONST_INT)
3276 return true;
3277
3278 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3279
3280 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3281 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3282
3283 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3284 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3285 }
3286
3287
3288 /* Return the number of temporary registers that aarch64_add_offset_1
3289 would need to add OFFSET to a register. */
3290
3291 static unsigned int
3292 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3293 {
3294 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3295 }
3296
3297 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3298 a non-polynomial OFFSET. MODE is the mode of the addition.
3299 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3300 be set and CFA adjustments added to the generated instructions.
3301
3302 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3303 temporary if register allocation is already complete. This temporary
3304 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3305 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3306 the immediate again.
3307
3308 Since this function may be used to adjust the stack pointer, we must
3309 ensure that it cannot cause transient stack deallocation (for example
3310 by first incrementing SP and then decrementing when adjusting by a
3311 large immediate). */
3312
3313 static void
3314 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3315 rtx src, HOST_WIDE_INT offset, rtx temp1,
3316 bool frame_related_p, bool emit_move_imm)
3317 {
3318 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3319 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3320
3321 HOST_WIDE_INT moffset = abs_hwi (offset);
3322 rtx_insn *insn;
3323
3324 if (!moffset)
3325 {
3326 if (!rtx_equal_p (dest, src))
3327 {
3328 insn = emit_insn (gen_rtx_SET (dest, src));
3329 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3330 }
3331 return;
3332 }
3333
3334 /* Single instruction adjustment. */
3335 if (aarch64_uimm12_shift (moffset))
3336 {
3337 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3338 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3339 return;
3340 }
3341
3342 /* Emit 2 additions/subtractions if the adjustment fits in 24 bits
3343 and either:
3344
3345 a) the offset cannot be loaded by a 16-bit move or
3346 b) there is no spare register into which we can move it. */
3347 if (moffset < 0x1000000
3348 && ((!temp1 && !can_create_pseudo_p ())
3349 || !aarch64_move_imm (moffset, mode)))
3350 {
3351 HOST_WIDE_INT low_off = moffset & 0xfff;
3352
3353 low_off = offset < 0 ? -low_off : low_off;
3354 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3355 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3356 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3357 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3358 return;
3359 }
3360
3361 /* Emit a move immediate if required and an addition/subtraction. */
3362 if (emit_move_imm)
3363 {
3364 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3365 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3366 }
3367 insn = emit_insn (offset < 0
3368 ? gen_sub3_insn (dest, src, temp1)
3369 : gen_add3_insn (dest, src, temp1));
3370 if (frame_related_p)
3371 {
3372 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3373 rtx adj = plus_constant (mode, src, offset);
3374 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3375 }
3376 }
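/* For example, an offset of 0x123456 is neither a shifted 12-bit
   immediate nor a MOV-able immediate, but it fits in 24 bits, so
   aarch64_add_offset_1 emits two additions: one of #0x456 and one of
   #0x123000.  */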
3377
3378 /* Return the number of temporary registers that aarch64_add_offset
3379 would need to move OFFSET into a register or add OFFSET to a register;
3380 ADD_P is true if we want the latter rather than the former. */
3381
3382 static unsigned int
3383 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3384 {
3385 /* This follows the same structure as aarch64_add_offset. */
3386 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3387 return 0;
3388
3389 unsigned int count = 0;
3390 HOST_WIDE_INT factor = offset.coeffs[1];
3391 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3392 poly_int64 poly_offset (factor, factor);
3393 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3394 /* Need one register for the ADDVL/ADDPL result. */
3395 count += 1;
3396 else if (factor != 0)
3397 {
3398 factor = abs (factor);
3399 if (factor > 16 * (factor & -factor))
3400 /* Need one register for the CNT result and one for the multiplication
3401 factor. If necessary, the second temporary can be reused for the
3402 constant part of the offset. */
3403 return 2;
3404 /* Need one register for the CNT result (which might then
3405 be shifted). */
3406 count += 1;
3407 }
3408 return count + aarch64_add_offset_1_temporaries (constant);
3409 }
3410
3411 /* If X can be represented as a poly_int64, return the number
3412 of temporaries that are required to add it to a register.
3413 Return -1 otherwise. */
3414
3415 int
3416 aarch64_add_offset_temporaries (rtx x)
3417 {
3418 poly_int64 offset;
3419 if (!poly_int_rtx_p (x, &offset))
3420 return -1;
3421 return aarch64_offset_temporaries (true, offset);
3422 }
3423
3424 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3425 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3426 be set and CFA adjustments added to the generated instructions.
3427
3428 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3429 temporary if register allocation is already complete. This temporary
3430 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3431 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3432 false to avoid emitting the immediate again.
3433
3434 TEMP2, if nonnull, is a second temporary register that doesn't
3435 overlap either DEST or SRC.
3436
3437 Since this function may be used to adjust the stack pointer, we must
3438 ensure that it cannot cause transient stack deallocation (for example
3439 by first incrementing SP and then decrementing when adjusting by a
3440 large immediate). */
3441
3442 static void
3443 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3444 poly_int64 offset, rtx temp1, rtx temp2,
3445 bool frame_related_p, bool emit_move_imm = true)
3446 {
3447 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3448 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3449 gcc_assert (temp1 == NULL_RTX
3450 || !frame_related_p
3451 || !reg_overlap_mentioned_p (temp1, dest));
3452 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3453
3454 /* Try using ADDVL or ADDPL to add the whole value. */
3455 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3456 {
3457 rtx offset_rtx = gen_int_mode (offset, mode);
3458 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3459 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3460 return;
3461 }
3462
3463 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3464 SVE vector register, over and above the minimum size of 128 bits.
3465 This is equivalent to half the value returned by CNTD with a
3466 vector shape of ALL. */
3467 HOST_WIDE_INT factor = offset.coeffs[1];
3468 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3469
3470 /* Try using ADDVL or ADDPL to add the VG-based part. */
3471 poly_int64 poly_offset (factor, factor);
3472 if (src != const0_rtx
3473 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3474 {
3475 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3476 if (frame_related_p)
3477 {
3478 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3479 RTX_FRAME_RELATED_P (insn) = true;
3480 src = dest;
3481 }
3482 else
3483 {
3484 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3485 src = aarch64_force_temporary (mode, temp1, addr);
3486 temp1 = temp2;
3487 temp2 = NULL_RTX;
3488 }
3489 }
3490 /* Otherwise use a CNT-based sequence. */
3491 else if (factor != 0)
3492 {
3493 /* Use a subtraction if we have a negative factor. */
3494 rtx_code code = PLUS;
3495 if (factor < 0)
3496 {
3497 factor = -factor;
3498 code = MINUS;
3499 }
3500
3501 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3502 into the multiplication. */
3503 rtx val;
3504 int shift = 0;
3505 if (factor & 1)
3506 /* Use a right shift by 1. */
3507 shift = -1;
3508 else
3509 factor /= 2;
3510 HOST_WIDE_INT low_bit = factor & -factor;
3511 if (factor <= 16 * low_bit)
3512 {
3513 if (factor > 16 * 8)
3514 {
3515 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3516 the value with the minimum multiplier and shift it into
3517 position. */
3518 int extra_shift = exact_log2 (low_bit);
3519 shift += extra_shift;
3520 factor >>= extra_shift;
3521 }
3522 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3523 }
3524 else
3525 {
3526 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3527 directly, since that should increase the chances of being
3528 able to use a shift and add sequence. If LOW_BIT itself
3529 is out of range, just use CNTD. */
3530 if (low_bit <= 16 * 8)
3531 factor /= low_bit;
3532 else
3533 low_bit = 1;
3534
3535 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3536 val = aarch64_force_temporary (mode, temp1, val);
3537
3538 if (can_create_pseudo_p ())
3539 {
3540 rtx coeff1 = gen_int_mode (factor, mode);
3541 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3542 }
3543 else
3544 {
3545 /* Go back to using a negative multiplication factor if we have
3546 no register from which to subtract. */
3547 if (code == MINUS && src == const0_rtx)
3548 {
3549 factor = -factor;
3550 code = PLUS;
3551 }
3552 rtx coeff1 = gen_int_mode (factor, mode);
3553 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3554 val = gen_rtx_MULT (mode, val, coeff1);
3555 }
3556 }
3557
3558 if (shift > 0)
3559 {
3560 /* Multiply by 1 << SHIFT. */
3561 val = aarch64_force_temporary (mode, temp1, val);
3562 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3563 }
3564 else if (shift == -1)
3565 {
3566 /* Divide by 2. */
3567 val = aarch64_force_temporary (mode, temp1, val);
3568 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3569 }
3570
3571 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3572 if (src != const0_rtx)
3573 {
3574 val = aarch64_force_temporary (mode, temp1, val);
3575 val = gen_rtx_fmt_ee (code, mode, src, val);
3576 }
3577 else if (code == MINUS)
3578 {
3579 val = aarch64_force_temporary (mode, temp1, val);
3580 val = gen_rtx_NEG (mode, val);
3581 }
3582
3583 if (constant == 0 || frame_related_p)
3584 {
3585 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3586 if (frame_related_p)
3587 {
3588 RTX_FRAME_RELATED_P (insn) = true;
3589 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3590 gen_rtx_SET (dest, plus_constant (Pmode, src,
3591 poly_offset)));
3592 }
3593 src = dest;
3594 if (constant == 0)
3595 return;
3596 }
3597 else
3598 {
3599 src = aarch64_force_temporary (mode, temp1, val);
3600 temp1 = temp2;
3601 temp2 = NULL_RTX;
3602 }
3603
3604 emit_move_imm = true;
3605 }
3606
3607 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3608 frame_related_p, emit_move_imm);
3609 }
3610
3611 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3612 than a poly_int64. */
3613
3614 void
3615 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3616 rtx offset_rtx, rtx temp1, rtx temp2)
3617 {
3618 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3619 temp1, temp2, false);
3620 }
3621
3622 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3623 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3624 if TEMP1 already contains abs (DELTA). */
3625
3626 static inline void
3627 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3628 {
3629 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3630 temp1, temp2, true, emit_move_imm);
3631 }
3632
3633 /* Subtract DELTA from the stack pointer, marking the instructions
3634 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3635 if nonnull. */
3636
3637 static inline void
3638 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3639 bool emit_move_imm = true)
3640 {
3641 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3642 temp1, temp2, frame_related_p, emit_move_imm);
3643 }
3644
3645 /* Set DEST to (vec_series BASE STEP). */
3646
3647 static void
3648 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3649 {
3650 machine_mode mode = GET_MODE (dest);
3651 scalar_mode inner = GET_MODE_INNER (mode);
3652
3653 /* Each operand can be a register or an immediate in the range [-16, 15]. */
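/* For example, (vec_series 0 1) becomes "INDEX Zd.<T>, #0, #1", giving
{ 0, 1, 2, ... }. */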
3654 if (!aarch64_sve_index_immediate_p (base))
3655 base = force_reg (inner, base);
3656 if (!aarch64_sve_index_immediate_p (step))
3657 step = force_reg (inner, step);
3658
3659 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3660 }
3661
3662 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3663 register of mode MODE. Use TARGET for the result if it's nonnull
3664 and convenient.
3665
3666 The two vector modes must have the same element mode. The behavior
3667 is to duplicate architectural lane N of SRC into architectural lanes
3668 N + I * STEP of the result. On big-endian targets, architectural
3669 lane 0 of an Advanced SIMD vector is the last element of the vector
3670 in memory layout, so for big-endian targets this operation has the
3671 effect of reversing SRC before duplicating it. Callers need to
3672 account for this. */
3673
3674 rtx
3675 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3676 {
3677 machine_mode src_mode = GET_MODE (src);
3678 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3679 insn_code icode = (BYTES_BIG_ENDIAN
3680 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3681 : code_for_aarch64_vec_duplicate_vq_le (mode));
3682
3683 unsigned int i = 0;
3684 expand_operand ops[3];
3685 create_output_operand (&ops[i++], target, mode);
3686 create_input_operand (&ops[i++], src, src_mode);
3687 if (BYTES_BIG_ENDIAN)
3688 {
3689 /* Create a PARALLEL describing the reversal of SRC. */
3690 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3691 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3692 nelts_per_vq - 1, -1);
3693 create_fixed_operand (&ops[i++], sel);
3694 }
3695 expand_insn (icode, i, ops);
3696 return ops[0].value;
3697 }
3698
3699 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3700 the memory image into DEST. Return true on success. */
3701
3702 static bool
3703 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3704 {
3705 src = force_const_mem (GET_MODE (src), src);
3706 if (!src)
3707 return false;
3708
3709 /* Make sure that the address is legitimate. */
3710 if (!aarch64_sve_ld1rq_operand_p (src))
3711 {
3712 rtx addr = force_reg (Pmode, XEXP (src, 0));
3713 src = replace_equiv_address (src, addr);
3714 }
3715
3716 machine_mode mode = GET_MODE (dest);
3717 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3718 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3719 rtx ptrue = aarch64_ptrue_reg (pred_mode);
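/* For 32-bit elements this emits, roughly, "LD1RQW Zd.S, Pg/Z, [addr]",
which loads one 128-bit block from memory and replicates it to every
quadword of the destination. */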
3720 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3721 return true;
3722 }
3723
3724 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3725 SVE data mode and isn't a legitimate constant. Use TARGET for the
3726 result if convenient.
3727
3728 The returned register can have whatever mode seems most natural
3729 given the contents of SRC. */
3730
3731 static rtx
3732 aarch64_expand_sve_const_vector (rtx target, rtx src)
3733 {
3734 machine_mode mode = GET_MODE (src);
3735 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3736 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3737 scalar_mode elt_mode = GET_MODE_INNER (mode);
3738 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3739 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3740
3741 if (nelts_per_pattern == 1 && encoded_bits == 128)
3742 {
3743 /* The constant is a duplicated quadword but can't be narrowed
3744 beyond a quadword. Get the memory image of the first quadword
3745 as a 128-bit vector and try using LD1RQ to load it from memory.
3746
3747 The effect for both endiannesses is to load memory lane N into
3748 architectural lanes N + I * STEP of the result. On big-endian
3749 targets, the layout of the 128-bit vector in an Advanced SIMD
3750 register would be different from its layout in an SVE register,
3751 but this 128-bit vector is a memory value only. */
3752 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3753 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3754 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3755 return target;
3756 }
3757
3758 if (nelts_per_pattern == 1 && encoded_bits < 128)
3759 {
3760 /* The vector is a repeating sequence of 64 bits or fewer.
3761 See if we can load them using an Advanced SIMD move and then
3762 duplicate it to fill a vector. This is better than using a GPR
3763 move because it keeps everything in the same register file. */
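/* For example, a constant with 16-bit elements that repeats { 1, 2 }
repeats every 32 bits, so it can either be built as a 128-bit Advanced
SIMD constant and duplicated quadword-wise, or (on little-endian) loaded
as the 32-bit integer 0x00020001 and broadcast across the vector. */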
3764 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3765 rtx_vector_builder builder (vq_mode, npatterns, 1);
3766 for (unsigned int i = 0; i < npatterns; ++i)
3767 {
3768 /* We want memory lane N to go into architectural lane N,
3769 so reverse for big-endian targets. The DUP .Q pattern
3770 has a compensating reverse built-in. */
3771 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3772 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3773 }
3774 rtx vq_src = builder.build ();
3775 if (aarch64_simd_valid_immediate (vq_src, NULL))
3776 {
3777 vq_src = force_reg (vq_mode, vq_src);
3778 return aarch64_expand_sve_dupq (target, mode, vq_src);
3779 }
3780
3781 /* Get an integer representation of the repeating part of Advanced
3782 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3783 which for big-endian targets is lane-swapped wrt a normal
3784 Advanced SIMD vector. This means that for both endiannesses,
3785 memory lane N of SVE vector SRC corresponds to architectural
3786 lane N of a register holding VQ_SRC. This in turn means that
3787 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3788 as a single 128-bit value) and thus that memory lane 0 of SRC is
3789 in the lsb of the integer. Duplicating the integer therefore
3790 ensures that memory lane N of SRC goes into architectural lane
3791 N + I * STEP of the SVE register. */
3792 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3793 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3794 if (elt_value)
3795 {
3796 /* Pretend that we had a vector of INT_MODE to start with. */
3797 elt_mode = int_mode;
3798 mode = aarch64_full_sve_mode (int_mode).require ();
3799
3800 /* If the integer can be moved into a general register by a
3801 single instruction, do that and duplicate the result. */
3802 if (CONST_INT_P (elt_value)
3803 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3804 {
3805 elt_value = force_reg (elt_mode, elt_value);
3806 return expand_vector_broadcast (mode, elt_value);
3807 }
3808 }
3809 else if (npatterns == 1)
3810 /* We're duplicating a single value, but can't do better than
3811 force it to memory and load from there. This handles things
3812 like symbolic constants. */
3813 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3814
3815 if (elt_value)
3816 {
3817 /* Load the element from memory if we can, otherwise move it into
3818 a register and use a DUP. */
3819 rtx op = force_const_mem (elt_mode, elt_value);
3820 if (!op)
3821 op = force_reg (elt_mode, elt_value);
3822 return expand_vector_broadcast (mode, op);
3823 }
3824 }
3825
3826 /* Try using INDEX. */
3827 rtx base, step;
3828 if (const_vec_series_p (src, &base, &step))
3829 {
3830 aarch64_expand_vec_series (target, base, step);
3831 return target;
3832 }
3833
3834 /* From here on, it's better to force the whole constant to memory
3835 if we can. */
3836 if (GET_MODE_NUNITS (mode).is_constant ())
3837 return NULL_RTX;
3838
3839 /* Expand each pattern individually. */
3840 gcc_assert (npatterns > 1);
3841 rtx_vector_builder builder;
3842 auto_vec<rtx, 16> vectors (npatterns);
3843 for (unsigned int i = 0; i < npatterns; ++i)
3844 {
3845 builder.new_vector (mode, 1, nelts_per_pattern);
3846 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3847 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3848 vectors.quick_push (force_reg (mode, builder.build ()));
3849 }
3850
3851 /* Use permutes to interleave the separate vectors. */
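/* For example, with four patterns A, B, C and D, the first round
produces { A, C, A, C, ... } and { B, D, B, D, ... }, and the final ZIP1
of those two vectors gives { A, B, C, D, A, B, C, D, ... }. */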
3852 while (npatterns > 1)
3853 {
3854 npatterns /= 2;
3855 for (unsigned int i = 0; i < npatterns; ++i)
3856 {
3857 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3858 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3859 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3860 vectors[i] = tmp;
3861 }
3862 }
3863 gcc_assert (vectors[0] == target);
3864 return target;
3865 }
3866
3867 /* Use WHILE to set a predicate register of mode MODE in which the first
3868 VL bits are set and the rest are clear. Use TARGET for the register
3869 if it's nonnull and convenient. */
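/* For example, MODE == VNx4BI and VL == 3 emits, roughly,
"WHILELO Pd.S, XZR, Xn" with Xn containing 3, which sets the first
three .S lanes of the predicate. */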
3870
3871 static rtx
3872 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3873 unsigned int vl)
3874 {
3875 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3876 target = aarch64_target_reg (target, mode);
3877 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3878 return target;
3879 }
3880
3881 static rtx
3882 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3883
3884 /* BUILDER is a constant predicate in which the index of every set bit
3885 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3886 by inverting every element at a multiple of ELT_SIZE and EORing the
3887 result with an ELT_SIZE PTRUE.
3888
3889 Return a register that contains the constant on success, otherwise
3890 return null. Use TARGET as the register if it is nonnull and
3891 convenient. */
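/* For example, { 0, 1, 1, 1, ... } (everything active except the first
element) inverts to { 1, 0, 0, 0, ... }, which can be loaded with a
single PTRUE with pattern VL1; EORing that with a PTRUE ALL of the same
element size recovers the original constant. */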
3892
3893 static rtx
3894 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3895 unsigned int elt_size)
3896 {
3897 /* Invert every element at a multiple of ELT_SIZE, keeping the
3898 other bits zero. */
3899 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3900 builder.nelts_per_pattern ());
3901 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3902 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3903 inv_builder.quick_push (const1_rtx);
3904 else
3905 inv_builder.quick_push (const0_rtx);
3906 inv_builder.finalize ();
3907
3908 /* See if we can load the constant cheaply. */
3909 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3910 if (!inv)
3911 return NULL_RTX;
3912
3913 /* EOR the result with an ELT_SIZE PTRUE. */
3914 rtx mask = aarch64_ptrue_all (elt_size);
3915 mask = force_reg (VNx16BImode, mask);
3916 target = aarch64_target_reg (target, VNx16BImode);
3917 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3918 return target;
3919 }
3920
3921 /* BUILDER is a constant predicate in which the index of every set bit
3922 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3923 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3924 register on success, otherwise return null. Use TARGET as the register
3925 if nonnull and convenient. */
3926
3927 static rtx
3928 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3929 unsigned int elt_size,
3930 unsigned int permute_size)
3931 {
3932 /* We're going to split the constant into two new constants A and B,
3933 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3934 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3935
3936 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3937 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3938
3939 where _ indicates elements that will be discarded by the permute.
3940
3941 First calculate the ELT_SIZEs for A and B. */
3942 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3943 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3944 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3945 if (INTVAL (builder.elt (i)) != 0)
3946 {
3947 if (i & permute_size)
3948 b_elt_size |= i - permute_size;
3949 else
3950 a_elt_size |= i;
3951 }
3952 a_elt_size &= -a_elt_size;
3953 b_elt_size &= -b_elt_size;
3954
3955 /* Now construct the vectors themselves. */
3956 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3957 builder.nelts_per_pattern ());
3958 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3959 builder.nelts_per_pattern ());
3960 unsigned int nelts = builder.encoded_nelts ();
3961 for (unsigned int i = 0; i < nelts; ++i)
3962 if (i & (elt_size - 1))
3963 {
3964 a_builder.quick_push (const0_rtx);
3965 b_builder.quick_push (const0_rtx);
3966 }
3967 else if ((i & permute_size) == 0)
3968 {
3969 /* The A and B elements are significant. */
3970 a_builder.quick_push (builder.elt (i));
3971 b_builder.quick_push (builder.elt (i + permute_size));
3972 }
3973 else
3974 {
3975 /* The A and B elements are going to be discarded, so pick whatever
3976 is likely to give a nice constant. We are targeting element
3977 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3978 with the aim of each being a sequence of ones followed by
3979 a sequence of zeros. So:
3980
3981 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3982 duplicate the last X_ELT_SIZE element, to extend the
3983 current sequence of ones or zeros.
3984
3985 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3986 zero, so that the constant really does have X_ELT_SIZE and
3987 not a smaller size. */
3988 if (a_elt_size > permute_size)
3989 a_builder.quick_push (const0_rtx);
3990 else
3991 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3992 if (b_elt_size > permute_size)
3993 b_builder.quick_push (const0_rtx);
3994 else
3995 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3996 }
3997 a_builder.finalize ();
3998 b_builder.finalize ();
3999
4000 /* Try loading A into a register. */
4001 rtx_insn *last = get_last_insn ();
4002 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4003 if (!a)
4004 return NULL_RTX;
4005
4006 /* Try loading B into a register. */
4007 rtx b = a;
4008 if (a_builder != b_builder)
4009 {
4010 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4011 if (!b)
4012 {
4013 delete_insns_since (last);
4014 return NULL_RTX;
4015 }
4016 }
4017
4018 /* Emit the TRN1 itself. */
4019 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4020 target = aarch64_target_reg (target, mode);
4021 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4022 gen_lowpart (mode, a),
4023 gen_lowpart (mode, b)));
4024 return target;
4025 }
4026
4027 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4028 constant in BUILDER into an SVE predicate register. Return the register
4029 on success, otherwise return null. Use TARGET for the register if
4030 nonnull and convenient.
4031
4032 ALLOW_RECURSE_P is true if we can use methods that would call this
4033 function recursively. */
4034
4035 static rtx
4036 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4037 bool allow_recurse_p)
4038 {
4039 if (builder.encoded_nelts () == 1)
4040 /* A PFALSE or a PTRUE .B ALL. */
4041 return aarch64_emit_set_immediate (target, builder);
4042
4043 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4044 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4045 {
4046 /* If we can load the constant using PTRUE, use it as-is. */
4047 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4048 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4049 return aarch64_emit_set_immediate (target, builder);
4050
4051 /* Otherwise use WHILE to set the first VL bits. */
4052 return aarch64_sve_move_pred_via_while (target, mode, vl);
4053 }
4054
4055 if (!allow_recurse_p)
4056 return NULL_RTX;
4057
4058 /* Try inverting the vector in element size ELT_SIZE and then EORing
4059 the result with an ELT_SIZE PTRUE. */
4060 if (INTVAL (builder.elt (0)) == 0)
4061 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4062 elt_size))
4063 return res;
4064
4065 /* Try using TRN1 to permute two simpler constants. */
4066 for (unsigned int i = elt_size; i <= 8; i *= 2)
4067 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4068 elt_size, i))
4069 return res;
4070
4071 return NULL_RTX;
4072 }
4073
4074 /* Return an SVE predicate register that contains the VNx16BImode
4075 constant in BUILDER, without going through the move expanders.
4076
4077 The returned register can have whatever mode seems most natural
4078 given the contents of BUILDER. Use TARGET for the result if
4079 convenient. */
4080
4081 static rtx
4082 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4083 {
4084 /* Try loading the constant using pure predicate operations. */
4085 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4086 return res;
4087
4088 /* Try forcing the constant to memory. */
4089 if (builder.full_nelts ().is_constant ())
4090 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4091 {
4092 target = aarch64_target_reg (target, VNx16BImode);
4093 emit_move_insn (target, mem);
4094 return target;
4095 }
4096
4097 /* The last resort is to load the constant as an integer and then
4098 compare it against zero. Use -1 for set bits in order to increase
4099 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4100 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4101 builder.nelts_per_pattern ());
4102 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4103 int_builder.quick_push (INTVAL (builder.elt (i))
4104 ? constm1_rtx : const0_rtx);
4105 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4106 int_builder.build ());
4107 }
4108
4109 /* Set DEST to immediate IMM. */
4110
4111 void
4112 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4113 {
4114 machine_mode mode = GET_MODE (dest);
4115
4116 /* Check what type of symbol it is. */
4117 scalar_int_mode int_mode;
4118 if ((GET_CODE (imm) == SYMBOL_REF
4119 || GET_CODE (imm) == LABEL_REF
4120 || GET_CODE (imm) == CONST
4121 || GET_CODE (imm) == CONST_POLY_INT)
4122 && is_a <scalar_int_mode> (mode, &int_mode))
4123 {
4124 rtx mem;
4125 poly_int64 offset;
4126 HOST_WIDE_INT const_offset;
4127 enum aarch64_symbol_type sty;
4128
4129 /* If we have (const (plus symbol offset)), separate out the offset
4130 before we start classifying the symbol. */
4131 rtx base = strip_offset (imm, &offset);
4132
4133 /* We must always add an offset involving VL separately, rather than
4134 folding it into the relocation. */
4135 if (!offset.is_constant (&const_offset))
4136 {
4137 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4138 emit_insn (gen_rtx_SET (dest, imm));
4139 else
4140 {
4141 /* Do arithmetic on 32-bit values if the result is smaller
4142 than that. */
4143 if (partial_subreg_p (int_mode, SImode))
4144 {
4145 /* It is invalid to do symbol calculations in modes
4146 narrower than SImode. */
4147 gcc_assert (base == const0_rtx);
4148 dest = gen_lowpart (SImode, dest);
4149 int_mode = SImode;
4150 }
4151 if (base != const0_rtx)
4152 {
4153 base = aarch64_force_temporary (int_mode, dest, base);
4154 aarch64_add_offset (int_mode, dest, base, offset,
4155 NULL_RTX, NULL_RTX, false);
4156 }
4157 else
4158 aarch64_add_offset (int_mode, dest, base, offset,
4159 dest, NULL_RTX, false);
4160 }
4161 return;
4162 }
4163
4164 sty = aarch64_classify_symbol (base, const_offset);
4165 switch (sty)
4166 {
4167 case SYMBOL_FORCE_TO_MEM:
4168 if (const_offset != 0
4169 && targetm.cannot_force_const_mem (int_mode, imm))
4170 {
4171 gcc_assert (can_create_pseudo_p ());
4172 base = aarch64_force_temporary (int_mode, dest, base);
4173 aarch64_add_offset (int_mode, dest, base, const_offset,
4174 NULL_RTX, NULL_RTX, false);
4175 return;
4176 }
4177
4178 mem = force_const_mem (ptr_mode, imm);
4179 gcc_assert (mem);
4180
4181 /* If we aren't generating PC relative literals, then
4182 we need to expand the literal pool access carefully.
4183 This is something that needs to be done in a number
4184 of places, so could well live as a separate function. */
4185 if (!aarch64_pcrelative_literal_loads)
4186 {
4187 gcc_assert (can_create_pseudo_p ());
4188 base = gen_reg_rtx (ptr_mode);
4189 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4190 if (ptr_mode != Pmode)
4191 base = convert_memory_address (Pmode, base);
4192 mem = gen_rtx_MEM (ptr_mode, base);
4193 }
4194
4195 if (int_mode != ptr_mode)
4196 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4197
4198 emit_insn (gen_rtx_SET (dest, mem));
4199
4200 return;
4201
4202 case SYMBOL_SMALL_TLSGD:
4203 case SYMBOL_SMALL_TLSDESC:
4204 case SYMBOL_SMALL_TLSIE:
4205 case SYMBOL_SMALL_GOT_28K:
4206 case SYMBOL_SMALL_GOT_4G:
4207 case SYMBOL_TINY_GOT:
4208 case SYMBOL_TINY_TLSIE:
4209 if (const_offset != 0)
4210 {
4211 gcc_assert (can_create_pseudo_p ());
4212 base = aarch64_force_temporary (int_mode, dest, base);
4213 aarch64_add_offset (int_mode, dest, base, const_offset,
4214 NULL_RTX, NULL_RTX, false);
4215 return;
4216 }
4217 /* FALLTHRU */
4218
4219 case SYMBOL_SMALL_ABSOLUTE:
4220 case SYMBOL_TINY_ABSOLUTE:
4221 case SYMBOL_TLSLE12:
4222 case SYMBOL_TLSLE24:
4223 case SYMBOL_TLSLE32:
4224 case SYMBOL_TLSLE48:
4225 aarch64_load_symref_appropriately (dest, imm, sty);
4226 return;
4227
4228 default:
4229 gcc_unreachable ();
4230 }
4231 }
4232
4233 if (!CONST_INT_P (imm))
4234 {
4235 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4236 {
4237 /* Only the low bit of each .H, .S and .D element is defined,
4238 so we can set the upper bits to whatever we like. If the
4239 predicate is all-true in MODE, prefer to set all the undefined
4240 bits as well, so that we can share a single .B predicate for
4241 all modes. */
4242 if (imm == CONSTM1_RTX (mode))
4243 imm = CONSTM1_RTX (VNx16BImode);
4244
4245 /* All methods for constructing predicate modes wider than VNx16BI
4246 will set the upper bits of each element to zero. Expose this
4247 by moving such constants as a VNx16BI, so that all bits are
4248 significant and so that constants for different modes can be
4249 shared. The wider constant will still be available as a
4250 REG_EQUAL note. */
4251 rtx_vector_builder builder;
4252 if (aarch64_get_sve_pred_bits (builder, imm))
4253 {
4254 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4255 if (dest != res)
4256 emit_move_insn (dest, gen_lowpart (mode, res));
4257 return;
4258 }
4259 }
4260
4261 if (GET_CODE (imm) == HIGH
4262 || aarch64_simd_valid_immediate (imm, NULL))
4263 {
4264 emit_insn (gen_rtx_SET (dest, imm));
4265 return;
4266 }
4267
4268 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4269 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4270 {
4271 if (dest != res)
4272 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4273 return;
4274 }
4275
4276 rtx mem = force_const_mem (mode, imm);
4277 gcc_assert (mem);
4278 emit_move_insn (dest, mem);
4279 return;
4280 }
4281
4282 aarch64_internal_mov_immediate (dest, imm, true,
4283 as_a <scalar_int_mode> (mode));
4284 }
4285
4286 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4287 that is known to contain PTRUE. */
4288
4289 void
4290 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4291 {
4292 expand_operand ops[3];
4293 machine_mode mode = GET_MODE (dest);
4294 create_output_operand (&ops[0], dest, mode);
4295 create_input_operand (&ops[1], pred, GET_MODE (pred));
4296 create_input_operand (&ops[2], src, mode);
4297 temporary_volatile_ok v (true);
4298 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4299 }
4300
4301 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4302 operand is in memory. In this case we need to use the predicated LD1
4303 and ST1 instead of LDR and STR, both for correctness on big-endian
4304 targets and because LD1 and ST1 support a wider range of addressing modes.
4305 PRED_MODE is the mode of the predicate.
4306
4307 See the comment at the head of aarch64-sve.md for details about the
4308 big-endian handling. */
4309
4310 void
4311 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4312 {
4313 machine_mode mode = GET_MODE (dest);
4314 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4315 if (!register_operand (src, mode)
4316 && !register_operand (dest, mode))
4317 {
4318 rtx tmp = gen_reg_rtx (mode);
4319 if (MEM_P (src))
4320 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4321 else
4322 emit_move_insn (tmp, src);
4323 src = tmp;
4324 }
4325 aarch64_emit_sve_pred_move (dest, ptrue, src);
4326 }
4327
4328 /* Called only on big-endian targets. See whether an SVE vector move
4329 from SRC to DEST is effectively a REV[BHW] instruction, because at
4330 least one operand is a subreg of an SVE vector that has wider or
4331 narrower elements. Return true and emit the instruction if so.
4332
4333 For example:
4334
4335 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4336
4337 represents a VIEW_CONVERT between the following vectors, viewed
4338 in memory order:
4339
4340 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4341 R1: { [0], [1], [2], [3], ... }
4342
4343 The high part of lane X in R2 should therefore correspond to lane X*2
4344 of R1, but the register representations are:
4345
4346 msb lsb
4347 R2: ...... [1].high [1].low [0].high [0].low
4348 R1: ...... [3] [2] [1] [0]
4349
4350 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4351 We therefore need a reverse operation to swap the high and low values
4352 around.
4353
4354 This is purely an optimization. Without it we would spill the
4355 subreg operand to the stack in one mode and reload it in the
4356 other mode, which has the same effect as the REV. */
4357
4358 bool
4359 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4360 {
4361 gcc_assert (BYTES_BIG_ENDIAN);
4362 if (GET_CODE (dest) == SUBREG)
4363 dest = SUBREG_REG (dest);
4364 if (GET_CODE (src) == SUBREG)
4365 src = SUBREG_REG (src);
4366
4367 /* The optimization handles two single SVE REGs with different element
4368 sizes. */
4369 if (!REG_P (dest)
4370 || !REG_P (src)
4371 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4372 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4373 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4374 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4375 return false;
4376
4377 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4378 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4379 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4380 UNSPEC_REV_SUBREG);
4381 emit_insn (gen_rtx_SET (dest, unspec));
4382 return true;
4383 }
4384
4385 /* Return a copy of X with mode MODE, without changing its other
4386 attributes. Unlike gen_lowpart, this doesn't care whether the
4387 mode change is valid. */
4388
4389 static rtx
4390 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4391 {
4392 if (GET_MODE (x) == mode)
4393 return x;
4394
4395 x = shallow_copy_rtx (x);
4396 set_mode_and_regno (x, mode, REGNO (x));
4397 return x;
4398 }
4399
4400 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4401 stored in wider integer containers. */
4402
4403 static unsigned int
4404 aarch64_sve_rev_unspec (machine_mode mode)
4405 {
4406 switch (GET_MODE_UNIT_SIZE (mode))
4407 {
4408 case 1: return UNSPEC_REVB;
4409 case 2: return UNSPEC_REVH;
4410 case 4: return UNSPEC_REVW;
4411 }
4412 gcc_unreachable ();
4413 }
4414
4415 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4416 operands. */
4417
4418 void
4419 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4420 {
4421 /* Decide which REV operation we need. The mode with wider elements
4422 determines the mode of the operands and the mode with the narrower
4423 elements determines the reverse width. */
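/* For example, a move between a VNx8HI destination and a VNx16QI source
uses VNx8HI as the wider-element mode and emits "REVB Zd.H, Pg/M, Zn.H",
which swaps the two bytes within each halfword. */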
4424 machine_mode mode_with_wider_elts = GET_MODE (dest);
4425 machine_mode mode_with_narrower_elts = GET_MODE (src);
4426 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4427 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4428 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4429
4430 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4431 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4432 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4433
4434 /* Get the operands in the appropriate modes and emit the instruction. */
4435 ptrue = gen_lowpart (pred_mode, ptrue);
4436 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4437 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4438 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4439 dest, ptrue, src));
4440 }
4441
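/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */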
4442 static bool
4443 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4444 tree exp ATTRIBUTE_UNUSED)
4445 {
4446 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4447 return false;
4448
4449 return true;
4450 }
4451
4452 /* Implement TARGET_PASS_BY_REFERENCE. */
4453
4454 static bool
4455 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4456 {
4457 HOST_WIDE_INT size;
4458 machine_mode dummymode;
4459 int nregs;
4460
4461 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4462 if (arg.mode == BLKmode && arg.type)
4463 size = int_size_in_bytes (arg.type);
4464 else
4465 /* No frontends can create types with variable-sized modes, so we
4466 shouldn't be asked to pass or return them. */
4467 size = GET_MODE_SIZE (arg.mode).to_constant ();
4468
4469 /* Aggregates are passed by reference based on their size. */
4470 if (arg.aggregate_type_p ())
4471 size = int_size_in_bytes (arg.type);
4472
4473 /* Variable-sized arguments are always passed by reference. */
4474 if (size < 0)
4475 return true;
4476
4477 /* Can this be a candidate to be passed in fp/simd register(s)? */
4478 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4479 &dummymode, &nregs,
4480 NULL))
4481 return false;
4482
4483 /* Arguments which are variable sized or larger than 2 registers are
4484 passed by reference unless they are a homogeneous floating
4485 aggregate. */
4486 return size > 2 * UNITS_PER_WORD;
4487 }
4488
4489 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4490 static bool
4491 aarch64_return_in_msb (const_tree valtype)
4492 {
4493 machine_mode dummy_mode;
4494 int dummy_int;
4495
4496 /* Never happens in little-endian mode. */
4497 if (!BYTES_BIG_ENDIAN)
4498 return false;
4499
4500 /* Only composite types smaller than or equal to 16 bytes can
4501 be potentially returned in registers. */
4502 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4503 || int_size_in_bytes (valtype) <= 0
4504 || int_size_in_bytes (valtype) > 16)
4505 return false;
4506
4507 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4508 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4509 is always passed/returned in the least significant bits of fp/simd
4510 register(s). */
4511 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4512 &dummy_mode, &dummy_int, NULL))
4513 return false;
4514
4515 return true;
4516 }
4517
4518 /* Implement TARGET_FUNCTION_VALUE.
4519 Define how to find the value returned by a function. */
4520
4521 static rtx
4522 aarch64_function_value (const_tree type, const_tree func,
4523 bool outgoing ATTRIBUTE_UNUSED)
4524 {
4525 machine_mode mode;
4526 int unsignedp;
4527 int count;
4528 machine_mode ag_mode;
4529
4530 mode = TYPE_MODE (type);
4531 if (INTEGRAL_TYPE_P (type))
4532 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4533
4534 if (aarch64_return_in_msb (type))
4535 {
4536 HOST_WIDE_INT size = int_size_in_bytes (type);
4537
4538 if (size % UNITS_PER_WORD != 0)
4539 {
4540 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4541 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4542 }
4543 }
4544
4545 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4546 &ag_mode, &count, NULL))
4547 {
4548 if (!aarch64_composite_type_p (type, mode))
4549 {
4550 gcc_assert (count == 1 && mode == ag_mode);
4551 return gen_rtx_REG (mode, V0_REGNUM);
4552 }
4553 else
4554 {
4555 int i;
4556 rtx par;
4557
4558 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4559 for (i = 0; i < count; i++)
4560 {
4561 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4562 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4563 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4564 XVECEXP (par, 0, i) = tmp;
4565 }
4566 return par;
4567 }
4568 }
4569 else
4570 return gen_rtx_REG (mode, R0_REGNUM);
4571 }
4572
4573 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4574 Return true if REGNO is the number of a hard register in which the values
4575 of called function may come back. */
4576
4577 static bool
4578 aarch64_function_value_regno_p (const unsigned int regno)
4579 {
4580 /* Maximum of 16 bytes can be returned in the general registers. Examples
4581 of 16-byte return values are: 128-bit integers and 16-byte small
4582 structures (excluding homogeneous floating-point aggregates). */
4583 if (regno == R0_REGNUM || regno == R1_REGNUM)
4584 return true;
4585
4586 /* Up to four fp/simd registers can return a function value, e.g. a
4587 homogeneous floating-point aggregate having four members. */
4588 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4589 return TARGET_FLOAT;
4590
4591 return false;
4592 }
4593
4594 /* Implement TARGET_RETURN_IN_MEMORY.
4595
4596 If the type T of the result of a function is such that
4597 void func (T arg)
4598 would require that arg be passed as a value in a register (or set of
4599 registers) according to the parameter passing rules, then the result
4600 is returned in the same registers as would be used for such an
4601 argument. */
4602
4603 static bool
4604 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4605 {
4606 HOST_WIDE_INT size;
4607 machine_mode ag_mode;
4608 int count;
4609
4610 if (!AGGREGATE_TYPE_P (type)
4611 && TREE_CODE (type) != COMPLEX_TYPE
4612 && TREE_CODE (type) != VECTOR_TYPE)
4613 /* Simple scalar types are always returned in registers. */
4614 return false;
4615
4616 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4617 type,
4618 &ag_mode,
4619 &count,
4620 NULL))
4621 return false;
4622
4623 /* Types larger than 2 registers are returned in memory. */
4624 size = int_size_in_bytes (type);
4625 return (size < 0 || size > 2 * UNITS_PER_WORD);
4626 }
4627
4628 static bool
4629 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4630 const_tree type, int *nregs)
4631 {
4632 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4633 return aarch64_vfp_is_call_or_return_candidate (mode,
4634 type,
4635 &pcum->aapcs_vfp_rmode,
4636 nregs,
4637 NULL);
4638 }
4639
4640 /* Given MODE and TYPE of a function argument, return the alignment in
4641 bits. The idea is to suppress any stronger alignment requested by
4642 the user and opt for the natural alignment (specified in AAPCS64 \S
4643 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4644 calculated in versions of GCC prior to GCC-9. This is a helper
4645 function for local use only. */
4646
4647 static unsigned int
4648 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4649 bool *abi_break)
4650 {
4651 *abi_break = false;
4652 if (!type)
4653 return GET_MODE_ALIGNMENT (mode);
4654
4655 if (integer_zerop (TYPE_SIZE (type)))
4656 return 0;
4657
4658 gcc_assert (TYPE_MODE (type) == mode);
4659
4660 if (!AGGREGATE_TYPE_P (type))
4661 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4662
4663 if (TREE_CODE (type) == ARRAY_TYPE)
4664 return TYPE_ALIGN (TREE_TYPE (type));
4665
4666 unsigned int alignment = 0;
4667 unsigned int bitfield_alignment = 0;
4668 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4669 if (TREE_CODE (field) == FIELD_DECL)
4670 {
4671 alignment = std::max (alignment, DECL_ALIGN (field));
4672 if (DECL_BIT_FIELD_TYPE (field))
4673 bitfield_alignment
4674 = std::max (bitfield_alignment,
4675 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4676 }
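/* For example, a bit-field declared with type __int128 contributes the
128-bit alignment of its declared type here, even though the FIELD_DECL
itself may have a smaller alignment; when that exceeds the alignment of
every ordinary field, the callers emit the GCC 9.1 -Wpsabi note. */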
4677
4678 if (bitfield_alignment > alignment)
4679 {
4680 *abi_break = true;
4681 return bitfield_alignment;
4682 }
4683
4684 return alignment;
4685 }
4686
4687 /* Layout a function argument according to the AAPCS64 rules. The rule
4688 numbers refer to the corresponding rules in the AAPCS64. */
4689
4690 static void
4691 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4692 const_tree type,
4693 bool named ATTRIBUTE_UNUSED)
4694 {
4695 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4696 int ncrn, nvrn, nregs;
4697 bool allocate_ncrn, allocate_nvrn;
4698 HOST_WIDE_INT size;
4699 bool abi_break;
4700
4701 /* We need to do this once per argument. */
4702 if (pcum->aapcs_arg_processed)
4703 return;
4704
4705 pcum->aapcs_arg_processed = true;
4706
4707 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
4708 if (type)
4709 size = int_size_in_bytes (type);
4710 else
4711 /* No frontends can create types with variable-sized modes, so we
4712 shouldn't be asked to pass or return them. */
4713 size = GET_MODE_SIZE (mode).to_constant ();
4714 size = ROUND_UP (size, UNITS_PER_WORD);
4715
4716 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4717 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4718 mode,
4719 type,
4720 &nregs);
4721
4722 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4723 The following code thus handles passing by SIMD/FP registers first. */
4724
4725 nvrn = pcum->aapcs_nvrn;
4726
4727 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4728 and homogeneous short-vector aggregates (HVA). */
4729 if (allocate_nvrn)
4730 {
4731 if (!TARGET_FLOAT)
4732 aarch64_err_no_fpadvsimd (mode);
4733
4734 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4735 {
4736 pcum->aapcs_nextnvrn = nvrn + nregs;
4737 if (!aarch64_composite_type_p (type, mode))
4738 {
4739 gcc_assert (nregs == 1);
4740 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4741 }
4742 else
4743 {
4744 rtx par;
4745 int i;
4746 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4747 for (i = 0; i < nregs; i++)
4748 {
4749 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4750 V0_REGNUM + nvrn + i);
4751 rtx offset = gen_int_mode
4752 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4753 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4754 XVECEXP (par, 0, i) = tmp;
4755 }
4756 pcum->aapcs_reg = par;
4757 }
4758 return;
4759 }
4760 else
4761 {
4762 /* C.3 NSRN is set to 8. */
4763 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4764 goto on_stack;
4765 }
4766 }
4767
4768 ncrn = pcum->aapcs_ncrn;
4769 nregs = size / UNITS_PER_WORD;
4770
4771 /* C6 - C9, though the sign and zero extension semantics are
4772 handled elsewhere. This is the case where the argument fits
4773 entirely in general registers. */
4774 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4775 {
4776 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4777
4778 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
4779 rounded up to the next even number. */
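/* For example, after a single int argument has consumed w0, a following
__int128 argument (16-byte alignment, two registers) skips x1 and is
passed in x2 and x3. */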
4780 if (nregs == 2
4781 && ncrn % 2
4782 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4783 comparison is there because for > 16 * BITS_PER_UNIT
4784 alignment nregs should be > 2 and therefore it should be
4785 passed by reference rather than value. */
4786 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4787 == 16 * BITS_PER_UNIT))
4788 {
4789 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4790 inform (input_location, "parameter passing for argument of type "
4791 "%qT changed in GCC 9.1", type);
4792 ++ncrn;
4793 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4794 }
4795
4796 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4797 A reg is still generated for it, but the caller should be smart
4798 enough not to use it. */
4799 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4800 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4801 else
4802 {
4803 rtx par;
4804 int i;
4805
4806 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4807 for (i = 0; i < nregs; i++)
4808 {
4809 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4810 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4811 GEN_INT (i * UNITS_PER_WORD));
4812 XVECEXP (par, 0, i) = tmp;
4813 }
4814 pcum->aapcs_reg = par;
4815 }
4816
4817 pcum->aapcs_nextncrn = ncrn + nregs;
4818 return;
4819 }
4820
4821 /* C.11 */
4822 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4823
4824 /* The argument is passed on the stack; record the needed number of words for
4825 this argument and align the total size if necessary. */
4826 on_stack:
4827 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4828
4829 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4830 == 16 * BITS_PER_UNIT)
4831 {
4832 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4833 if (pcum->aapcs_stack_size != new_size)
4834 {
4835 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4836 inform (input_location, "parameter passing for argument of type "
4837 "%qT changed in GCC 9.1", type);
4838 pcum->aapcs_stack_size = new_size;
4839 }
4840 }
4841 return;
4842 }
4843
4844 /* Implement TARGET_FUNCTION_ARG. */
4845
4846 static rtx
4847 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4848 {
4849 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4850 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4851
4852 if (arg.end_marker_p ())
4853 return NULL_RTX;
4854
4855 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4856 return pcum->aapcs_reg;
4857 }
4858
4859 void
4860 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4861 const_tree fntype ATTRIBUTE_UNUSED,
4862 rtx libname ATTRIBUTE_UNUSED,
4863 const_tree fndecl ATTRIBUTE_UNUSED,
4864 unsigned n_named ATTRIBUTE_UNUSED)
4865 {
4866 pcum->aapcs_ncrn = 0;
4867 pcum->aapcs_nvrn = 0;
4868 pcum->aapcs_nextncrn = 0;
4869 pcum->aapcs_nextnvrn = 0;
4870 pcum->pcs_variant = ARM_PCS_AAPCS64;
4871 pcum->aapcs_reg = NULL_RTX;
4872 pcum->aapcs_arg_processed = false;
4873 pcum->aapcs_stack_words = 0;
4874 pcum->aapcs_stack_size = 0;
4875
4876 if (!TARGET_FLOAT
4877 && fndecl && TREE_PUBLIC (fndecl)
4878 && fntype && fntype != error_mark_node)
4879 {
4880 const_tree type = TREE_TYPE (fntype);
4881 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4882 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4883 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4884 &mode, &nregs, NULL))
4885 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4886 }
4887 return;
4888 }
4889
4890 static void
4891 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4892 const function_arg_info &arg)
4893 {
4894 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4895 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4896 {
4897 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4898 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4899 != (pcum->aapcs_stack_words != 0));
4900 pcum->aapcs_arg_processed = false;
4901 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4902 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4903 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4904 pcum->aapcs_stack_words = 0;
4905 pcum->aapcs_reg = NULL_RTX;
4906 }
4907 }
4908
4909 bool
4910 aarch64_function_arg_regno_p (unsigned regno)
4911 {
4912 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4913 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4914 }
4915
4916 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4917 PARM_BOUNDARY bits of alignment, but will be given anything up
4918 to STACK_BOUNDARY bits if the type requires it. This makes sure
4919 that both before and after the layout of each argument, the Next
4920 Stacked Argument Address (NSAA) will have a minimum alignment of
4921 8 bytes. */
4922
4923 static unsigned int
4924 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4925 {
4926 bool abi_break;
4927 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4928 &abi_break);
4929 if (abi_break && warn_psabi)
4930 inform (input_location, "parameter passing for argument of type "
4931 "%qT changed in GCC 9.1", type);
4932
4933 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4934 }
4935
4936 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4937
4938 static fixed_size_mode
4939 aarch64_get_reg_raw_mode (int regno)
4940 {
4941 if (TARGET_SVE && FP_REGNUM_P (regno))
4942 /* Don't use the SVE part of the register for __builtin_apply and
4943 __builtin_return. The SVE registers aren't used by the normal PCS,
4944 so using them there would be a waste of time. The PCS extensions
4945 for SVE types are fundamentally incompatible with the
4946 __builtin_return/__builtin_apply interface. */
4947 return as_a <fixed_size_mode> (V16QImode);
4948 return default_get_reg_raw_mode (regno);
4949 }
4950
4951 /* Implement TARGET_FUNCTION_ARG_PADDING.
4952
4953 Small aggregate types are placed in the lowest memory address.
4954
4955 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4956
4957 static pad_direction
4958 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4959 {
4960 /* On little-endian targets, the least significant byte of every stack
4961 argument is passed at the lowest byte address of the stack slot. */
4962 if (!BYTES_BIG_ENDIAN)
4963 return PAD_UPWARD;
4964
4965 /* Otherwise, integral, floating-point and pointer types are padded downward:
4966 the least significant byte of a stack argument is passed at the highest
4967 byte address of the stack slot. */
4968 if (type
4969 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4970 || POINTER_TYPE_P (type))
4971 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4972 return PAD_DOWNWARD;
4973
4974 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4975 return PAD_UPWARD;
4976 }
4977
4978 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4979
4980 It specifies padding for the last (possibly the only)
4981 element of a block move between registers and memory. Assuming
4982 the block is in memory, padding upward means that the last
4983 element is padded after its most significant byte, while with
4984 downward padding the last element is padded at its least
4985 significant byte side.
4986
4987 Small aggregates and small complex types are always padded
4988 upwards.
4989
4990 We don't need to worry about homogeneous floating-point or
4991 short-vector aggregates; their move is not affected by the
4992 padding direction determined here. Regardless of endianness,
4993 each element of such an aggregate is put in the least
4994 significant bits of a fp/simd register.
4995
4996 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4997 register has useful data, and return the opposite if the most
4998 significant byte does. */
4999
5000 bool
5001 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5002 bool first ATTRIBUTE_UNUSED)
5003 {
5004
5005 /* Small composite types are always padded upward. */
5006 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5007 {
5008 HOST_WIDE_INT size;
5009 if (type)
5010 size = int_size_in_bytes (type);
5011 else
5012 /* No frontends can create types with variable-sized modes, so we
5013 shouldn't be asked to pass or return them. */
5014 size = GET_MODE_SIZE (mode).to_constant ();
5015 if (size < 2 * UNITS_PER_WORD)
5016 return true;
5017 }
5018
5019 /* Otherwise, use the default padding. */
5020 return !BYTES_BIG_ENDIAN;
5021 }
5022
5023 static scalar_int_mode
5024 aarch64_libgcc_cmp_return_mode (void)
5025 {
5026 return SImode;
5027 }
5028
5029 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5030
5031 /* We use the 12-bit shifted immediate arithmetic instructions so values
5032 must be a multiple of (1 << 12), i.e. 4096. */
5033 #define ARITH_FACTOR 4096
5034
5035 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5036 #error Cannot use simple address calculation for stack probing
5037 #endif
5038
5039 /* The pair of scratch registers used for stack probing. */
5040 #define PROBE_STACK_FIRST_REG R9_REGNUM
5041 #define PROBE_STACK_SECOND_REG R10_REGNUM
5042
5043 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5044 inclusive. These are offsets from the current stack pointer. */
5045
5046 static void
5047 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5048 {
5049 HOST_WIDE_INT size;
5050 if (!poly_size.is_constant (&size))
5051 {
5052 sorry ("stack probes for SVE frames");
5053 return;
5054 }
5055
5056 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5057
5058 /* See the same assertion on PROBE_INTERVAL above. */
5059 gcc_assert ((first % ARITH_FACTOR) == 0);
5060
5061 /* See if we have a constant small number of probes to generate. If so,
5062 that's the easy case. */
5063 if (size <= PROBE_INTERVAL)
5064 {
5065 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5066
5067 emit_set_insn (reg1,
5068 plus_constant (Pmode,
5069 stack_pointer_rtx, -(first + base)));
5070 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5071 }
5072
5073 /* The run-time loop is made up of 8 insns in the generic case while the
5074 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
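/* For example, with the default 4 KiB PROBE_INTERVAL, a SIZE of 10 KiB
emits probes at FIRST + 4096, FIRST + 8192 and FIRST + 10240 below the
stack pointer. */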
5075 else if (size <= 4 * PROBE_INTERVAL)
5076 {
5077 HOST_WIDE_INT i, rem;
5078
5079 emit_set_insn (reg1,
5080 plus_constant (Pmode,
5081 stack_pointer_rtx,
5082 -(first + PROBE_INTERVAL)));
5083 emit_stack_probe (reg1);
5084
5085 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5086 it exceeds SIZE. If only two probes are needed, this will not
5087 generate any code. Then probe at FIRST + SIZE. */
5088 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5089 {
5090 emit_set_insn (reg1,
5091 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5092 emit_stack_probe (reg1);
5093 }
5094
5095 rem = size - (i - PROBE_INTERVAL);
5096 if (rem > 256)
5097 {
5098 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5099
5100 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5101 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5102 }
5103 else
5104 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5105 }
5106
5107 /* Otherwise, do the same as above, but in a loop. Note that we must be
5108 extra careful with variables wrapping around because we might be at
5109 the very top (or the very bottom) of the address space and we have
5110 to be able to handle this case properly; in particular, we use an
5111 equality test for the loop condition. */
5112 else
5113 {
5114 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5115
5116 /* Step 1: round SIZE to the previous multiple of the interval. */
5117
5118 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5119
5120
5121 /* Step 2: compute initial and final value of the loop counter. */
5122
5123 /* TEST_ADDR = SP + FIRST. */
5124 emit_set_insn (reg1,
5125 plus_constant (Pmode, stack_pointer_rtx, -first));
5126
5127 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5128 HOST_WIDE_INT adjustment = - (first + rounded_size);
5129 if (! aarch64_uimm12_shift (adjustment))
5130 {
5131 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5132 true, Pmode);
5133 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5134 }
5135 else
5136 emit_set_insn (reg2,
5137 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5138
5139 /* Step 3: the loop
5140
5141 do
5142 {
5143 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5144 probe at TEST_ADDR
5145 }
5146 while (TEST_ADDR != LAST_ADDR)
5147
5148 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5149 until it is equal to ROUNDED_SIZE. */
5150
5151 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5152
5153
5154 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5155 that SIZE is equal to ROUNDED_SIZE. */
5156
5157 if (size != rounded_size)
5158 {
5159 HOST_WIDE_INT rem = size - rounded_size;
5160
5161 if (rem > 256)
5162 {
5163 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5164
5165 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5166 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5167 }
5168 else
5169 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5170 }
5171 }
5172
5173 /* Make sure nothing is scheduled before we are done. */
5174 emit_insn (gen_blockage ());
5175 }
5176
5177 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5178 absolute addresses. */
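/* With the default 4 KiB interval and no stack clash protection the
emitted loop is roughly:

.LPSRL0: sub reg1, reg1, #4096
str xzr, [reg1]
cmp reg1, reg2
b.ne .LPSRL0 */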
5179
5180 const char *
5181 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5182 {
5183 static int labelno = 0;
5184 char loop_lab[32];
5185 rtx xops[2];
5186
5187 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5188
5189 /* Loop. */
5190 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5191
5192 HOST_WIDE_INT stack_clash_probe_interval
5193 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5194
5195 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5196 xops[0] = reg1;
5197 HOST_WIDE_INT interval;
5198 if (flag_stack_clash_protection)
5199 interval = stack_clash_probe_interval;
5200 else
5201 interval = PROBE_INTERVAL;
5202
5203 gcc_assert (aarch64_uimm12_shift (interval));
5204 xops[1] = GEN_INT (interval);
5205
5206 output_asm_insn ("sub\t%0, %0, %1", xops);
5207
5208 /* If doing stack clash protection then we probe up by the ABI specified
5209 amount. We do this because we're dropping full pages at a time in the
5210 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5211 if (flag_stack_clash_protection)
5212 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5213 else
5214 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5215
5216 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5217 by this amount for each iteration. */
5218 output_asm_insn ("str\txzr, [%0, %1]", xops);
5219
5220 /* Test if TEST_ADDR == LAST_ADDR. */
5221 xops[1] = reg2;
5222 output_asm_insn ("cmp\t%0, %1", xops);
5223
5224 /* Branch. */
5225 fputs ("\tb.ne\t", asm_out_file);
5226 assemble_name_raw (asm_out_file, loop_lab);
5227 fputc ('\n', asm_out_file);
5228
5229 return "";
5230 }
5231
5232 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5233 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5234 of GUARD_SIZE. When a probe is emitted it is done at most
5235 MIN_PROBE_THRESHOLD bytes from the current BASE, and successive probes
5236 are at most MIN_PROBE_THRESHOLD bytes apart. By the end of this function
5237 BASE = BASE - ADJUSTMENT. */
5238
5239 const char *
5240 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5241 rtx min_probe_threshold, rtx guard_size)
5242 {
5243 /* This function is not allowed to use any instruction generation function
5244 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5245 so instead emit the code you want using output_asm_insn. */
5246 gcc_assert (flag_stack_clash_protection);
5247 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5248 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5249
5250 /* The minimum required allocation before the residual requires probing. */
5251 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5252
5253 /* Clamp the value down to the nearest value that can be used with a cmp. */
5254 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5255 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5256
5257 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5258 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5259
5260 static int labelno = 0;
5261 char loop_start_lab[32];
5262 char loop_end_lab[32];
5263 rtx xops[2];
5264
5265 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5266 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5267
5268 /* Emit loop start label. */
5269 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5270
5271 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5272 xops[0] = adjustment;
5273 xops[1] = probe_offset_value_rtx;
5274 output_asm_insn ("cmp\t%0, %1", xops);
5275
5276 /* Branch to end if not enough adjustment to probe. */
5277 fputs ("\tb.lt\t", asm_out_file);
5278 assemble_name_raw (asm_out_file, loop_end_lab);
5279 fputc ('\n', asm_out_file);
5280
5281 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5282 xops[0] = base;
5283 xops[1] = probe_offset_value_rtx;
5284 output_asm_insn ("sub\t%0, %0, %1", xops);
5285
5286 /* Probe at BASE. */
5287 xops[1] = const0_rtx;
5288 output_asm_insn ("str\txzr, [%0, %1]", xops);
5289
5290 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5291 xops[0] = adjustment;
5292 xops[1] = probe_offset_value_rtx;
5293 output_asm_insn ("sub\t%0, %0, %1", xops);
5294
5295 /* Branch to start if still more bytes to allocate. */
5296 fputs ("\tb\t", asm_out_file);
5297 assemble_name_raw (asm_out_file, loop_start_lab);
5298 fputc ('\n', asm_out_file);
5299
5300 /* No probe needed; exit the loop. */
5301 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5302
5303 /* BASE = BASE - ADJUSTMENT. */
5304 xops[0] = base;
5305 xops[1] = adjustment;
5306 output_asm_insn ("sub\t%0, %0, %1", xops);
5307 return "";
5308 }
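/* A rough picture of the sequence produced above, with <base>, <adj> and
   <guard> standing in for BASE, ADJUSTMENT and the clamped
   RESIDUAL_PROBE_GUARD (register names and label numbers are illustrative):

	.SVLPSPL0:
	cmp	<adj>, <guard>
	b.lt	.SVLPEND0
	sub	<base>, <base>, <guard>
	str	xzr, [<base>, 0]
	sub	<adj>, <adj>, <guard>
	b	.SVLPSPL0
	.SVLPEND0:
	sub	<base>, <base>, <adj>  */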
5309
5310 /* Determine whether a frame chain needs to be generated. */
5311 static bool
5312 aarch64_needs_frame_chain (void)
5313 {
5314 /* Force a frame chain for EH returns so the return address is at FP+8. */
5315 if (frame_pointer_needed || crtl->calls_eh_return)
5316 return true;
5317
5318 /* A leaf function cannot have calls or write LR. */
5319 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5320
5321 /* Don't use a frame chain in leaf functions if leaf frame pointers
5322 are disabled. */
5323 if (flag_omit_leaf_frame_pointer && is_leaf)
5324 return false;
5325
5326 return aarch64_use_frame_pointer;
5327 }
5328
5329 /* Mark the registers that need to be saved by the callee and calculate
5330 the size of the callee-saved registers area and frame record (both FP
5331 and LR may be omitted). */
5332 static void
5333 aarch64_layout_frame (void)
5334 {
5335 HOST_WIDE_INT offset = 0;
5336 int regno, last_fp_reg = INVALID_REGNUM;
5337 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5338
5339 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5340
5341 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5342 the mid-end is doing. */
5343 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5344
5345 #define SLOT_NOT_REQUIRED (-2)
5346 #define SLOT_REQUIRED (-1)
5347
5348 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5349 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5350
5351 /* If this is a non-leaf simd function with calls we assume that
5352 at least one of those calls is to a non-simd function and thus
5353 we must save V8 to V23 in the prologue. */
5354
5355 if (simd_function && !crtl->is_leaf)
5356 {
5357 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5358 if (FP_SIMD_SAVED_REGNUM_P (regno))
5359 df_set_regs_ever_live (regno, true);
5360 }
5361
5362 /* First mark all the registers that really need to be saved... */
5363 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5364 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5365
5366 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5367 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5368
5369 /* ... that includes the eh data registers (if needed)... */
5370 if (crtl->calls_eh_return)
5371 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5372 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5373 = SLOT_REQUIRED;
5374
5375 /* ... and any callee saved register that dataflow says is live. */
5376 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5377 if (df_regs_ever_live_p (regno)
5378 && (regno == R30_REGNUM
5379 || !call_used_or_fixed_reg_p (regno)))
5380 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5381
5382 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5383 if (df_regs_ever_live_p (regno)
5384 && (!call_used_or_fixed_reg_p (regno)
5385 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5386 {
5387 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5388 last_fp_reg = regno;
5389 }
5390
5391 if (cfun->machine->frame.emit_frame_chain)
5392 {
5393 /* FP and LR are placed in the linkage record. */
5394 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5395 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5396 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5397 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5398 offset = 2 * UNITS_PER_WORD;
5399 }
5400
5401 /* With stack-clash, LR must be saved in non-leaf functions. */
5402 gcc_assert (crtl->is_leaf
5403 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5404 != SLOT_NOT_REQUIRED));
5405
5406 /* Now assign stack slots for them. */
5407 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5408 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5409 {
5410 cfun->machine->frame.reg_offset[regno] = offset;
5411 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5412 cfun->machine->frame.wb_candidate1 = regno;
5413 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5414 cfun->machine->frame.wb_candidate2 = regno;
5415 offset += UNITS_PER_WORD;
5416 }
5417
5418 HOST_WIDE_INT max_int_offset = offset;
5419 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5420 bool has_align_gap = offset != max_int_offset;
5421
5422 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5423 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5424 {
5425 /* If there is an alignment gap between integer and fp callee-saves,
5426 allocate the last fp register to it if possible. */
5427 if (regno == last_fp_reg
5428 && has_align_gap
5429 && !simd_function
5430 && (offset & 8) == 0)
5431 {
5432 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5433 break;
5434 }
5435
5436 cfun->machine->frame.reg_offset[regno] = offset;
5437 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5438 cfun->machine->frame.wb_candidate1 = regno;
5439 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5440 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5441 cfun->machine->frame.wb_candidate2 = regno;
5442 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5443 }
5444
5445 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5446
5447 cfun->machine->frame.saved_regs_size = offset;
5448
5449 HOST_WIDE_INT varargs_and_saved_regs_size
5450 = offset + cfun->machine->frame.saved_varargs_size;
5451
5452 cfun->machine->frame.hard_fp_offset
5453 = aligned_upper_bound (varargs_and_saved_regs_size
5454 + get_frame_size (),
5455 STACK_BOUNDARY / BITS_PER_UNIT);
5456
5457 /* Both these values are already aligned. */
5458 gcc_assert (multiple_p (crtl->outgoing_args_size,
5459 STACK_BOUNDARY / BITS_PER_UNIT));
5460 cfun->machine->frame.frame_size
5461 = (cfun->machine->frame.hard_fp_offset
5462 + crtl->outgoing_args_size);
5463
5464 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5465
5466 cfun->machine->frame.initial_adjust = 0;
5467 cfun->machine->frame.final_adjust = 0;
5468 cfun->machine->frame.callee_adjust = 0;
5469 cfun->machine->frame.callee_offset = 0;
5470
5471 HOST_WIDE_INT max_push_offset = 0;
5472 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5473 max_push_offset = 512;
5474 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5475 max_push_offset = 256;
5476
5477 HOST_WIDE_INT const_size, const_fp_offset;
5478 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5479 && const_size < max_push_offset
5480 && known_eq (crtl->outgoing_args_size, 0))
5481 {
5482 /* Simple, small frame with no outgoing arguments:
5483 stp reg1, reg2, [sp, -frame_size]!
5484 stp reg3, reg4, [sp, 16] */
5485 cfun->machine->frame.callee_adjust = const_size;
5486 }
5487 else if (known_lt (crtl->outgoing_args_size
5488 + cfun->machine->frame.saved_regs_size, 512)
5489 && !(cfun->calls_alloca
5490 && known_lt (cfun->machine->frame.hard_fp_offset,
5491 max_push_offset)))
5492 {
5493 /* Frame with small outgoing arguments:
5494 sub sp, sp, frame_size
5495 stp reg1, reg2, [sp, outgoing_args_size]
5496 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5497 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5498 cfun->machine->frame.callee_offset
5499 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5500 }
5501 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5502 && const_fp_offset < max_push_offset)
5503 {
5504 /* Frame with large outgoing arguments but a small local area:
5505 stp reg1, reg2, [sp, -hard_fp_offset]!
5506 stp reg3, reg4, [sp, 16]
5507 sub sp, sp, outgoing_args_size */
5508 cfun->machine->frame.callee_adjust = const_fp_offset;
5509 cfun->machine->frame.final_adjust
5510 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5511 }
5512 else
5513 {
5514 /* Frame with large local area and outgoing arguments using frame pointer:
5515 sub sp, sp, hard_fp_offset
5516 stp x29, x30, [sp, 0]
5517 add x29, sp, 0
5518 stp reg3, reg4, [sp, 16]
5519 sub sp, sp, outgoing_args_size */
5520 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5521 cfun->machine->frame.final_adjust
5522 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5523 }
5524
5525 cfun->machine->frame.laid_out = true;
5526 }
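/* A worked example, for illustration only: a function with 40 bytes of
   locals, no outgoing arguments, an emitted frame chain and one extra
   callee-save (say x19) gets reg_offset[R29] = 0, reg_offset[R30] = 8 and
   reg_offset[x19] = 16, so saved_regs_size = 32, hard_fp_offset
   = ROUND_UP (32 + 40, 16) = 80 and frame_size = 80. Since 80 < 512 and
   there are no outgoing arguments, the first strategy above is chosen and
   callee_adjust is set to 80. */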
5527
5528 /* Return true if the register REGNO is saved on entry to
5529 the current function. */
5530
5531 static bool
5532 aarch64_register_saved_on_entry (int regno)
5533 {
5534 return cfun->machine->frame.reg_offset[regno] >= 0;
5535 }
5536
5537 /* Return the next register at or above REGNO, up to LIMIT, that the callee
5538 needs to save. */
5539
5540 static unsigned
5541 aarch64_next_callee_save (unsigned regno, unsigned limit)
5542 {
5543 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5544 regno ++;
5545 return regno;
5546 }
5547
5548 /* Push the register number REGNO of mode MODE to the stack with write-back
5549 adjusting the stack by ADJUSTMENT. */
5550
5551 static void
5552 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5553 HOST_WIDE_INT adjustment)
5554 {
5555 rtx base_rtx = stack_pointer_rtx;
5556 rtx insn, reg, mem;
5557
5558 reg = gen_rtx_REG (mode, regno);
5559 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5560 plus_constant (Pmode, base_rtx, -adjustment));
5561 mem = gen_frame_mem (mode, mem);
5562
5563 insn = emit_move_insn (mem, reg);
5564 RTX_FRAME_RELATED_P (insn) = 1;
5565 }
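/* For example (illustrative only): pushing x30 with an ADJUSTMENT of 16
   corresponds to the pre-indexed store "str x30, [sp, -16]!". */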
5566
5567 /* Generate and return an instruction to store the pair of registers
5568 REG and REG2 of mode MODE to location BASE with write-back adjusting
5569 the stack location BASE by ADJUSTMENT. */
5570
5571 static rtx
5572 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5573 HOST_WIDE_INT adjustment)
5574 {
5575 switch (mode)
5576 {
5577 case E_DImode:
5578 return gen_storewb_pairdi_di (base, base, reg, reg2,
5579 GEN_INT (-adjustment),
5580 GEN_INT (UNITS_PER_WORD - adjustment));
5581 case E_DFmode:
5582 return gen_storewb_pairdf_di (base, base, reg, reg2,
5583 GEN_INT (-adjustment),
5584 GEN_INT (UNITS_PER_WORD - adjustment));
5585 case E_TFmode:
5586 return gen_storewb_pairtf_di (base, base, reg, reg2,
5587 GEN_INT (-adjustment),
5588 GEN_INT (UNITS_PER_VREG - adjustment));
5589 default:
5590 gcc_unreachable ();
5591 }
5592 }
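/* For example (illustrative only): for E_DImode with REG = x29, REG2 = x30
   and ADJUSTMENT = 96, the generated insn corresponds to
   "stp x29, x30, [sp, -96]!" when BASE is the stack pointer. */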
5593
5594 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5595 stack pointer by ADJUSTMENT. */
5596
5597 static void
5598 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5599 {
5600 rtx_insn *insn;
5601 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5602
5603 if (regno2 == INVALID_REGNUM)
5604 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5605
5606 rtx reg1 = gen_rtx_REG (mode, regno1);
5607 rtx reg2 = gen_rtx_REG (mode, regno2);
5608
5609 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5610 reg2, adjustment));
5611 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5612 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5613 RTX_FRAME_RELATED_P (insn) = 1;
5614 }
5615
5616 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5617 adjusting it by ADJUSTMENT afterwards. */
5618
5619 static rtx
5620 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5621 HOST_WIDE_INT adjustment)
5622 {
5623 switch (mode)
5624 {
5625 case E_DImode:
5626 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5627 GEN_INT (UNITS_PER_WORD));
5628 case E_DFmode:
5629 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5630 GEN_INT (UNITS_PER_WORD));
5631 case E_TFmode:
5632 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5633 GEN_INT (UNITS_PER_VREG));
5634 default:
5635 gcc_unreachable ();
5636 }
5637 }
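/* For example (illustrative only): for E_DImode with REG = x29, REG2 = x30
   and ADJUSTMENT = 96, the generated insn corresponds to the post-indexed
   "ldp x29, x30, [sp], 96" when BASE is the stack pointer. */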
5638
5639 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5640 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5641 into CFI_OPS. */
5642
5643 static void
5644 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5645 rtx *cfi_ops)
5646 {
5647 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5648 rtx reg1 = gen_rtx_REG (mode, regno1);
5649
5650 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5651
5652 if (regno2 == INVALID_REGNUM)
5653 {
5654 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5655 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5656 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5657 }
5658 else
5659 {
5660 rtx reg2 = gen_rtx_REG (mode, regno2);
5661 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5662 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5663 reg2, adjustment));
5664 }
5665 }
5666
5667 /* Generate and return a store pair instruction of mode MODE to store
5668 register REG1 to MEM1 and register REG2 to MEM2. */
5669
5670 static rtx
5671 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5672 rtx reg2)
5673 {
5674 switch (mode)
5675 {
5676 case E_DImode:
5677 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5678
5679 case E_DFmode:
5680 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5681
5682 case E_TFmode:
5683 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5684
5685 default:
5686 gcc_unreachable ();
5687 }
5688 }
5689
5690 /* Generate and return a load pair instruction of mode MODE to load register
5691 REG1 from MEM1 and register REG2 from MEM2. */
5692
5693 static rtx
5694 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5695 rtx mem2)
5696 {
5697 switch (mode)
5698 {
5699 case E_DImode:
5700 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5701
5702 case E_DFmode:
5703 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5704
5705 case E_TFmode:
5706 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5707
5708 default:
5709 gcc_unreachable ();
5710 }
5711 }
5712
5713 /* Return TRUE if return address signing should be enabled for the current
5714 function, otherwise return FALSE. */
5715
5716 bool
5717 aarch64_return_address_signing_enabled (void)
5718 {
5719 /* This function should only be called after the frame is laid out. */
5720 gcc_assert (cfun->machine->frame.laid_out);
5721
5722 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5723 if its LR is pushed onto stack. */
5724 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5725 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5726 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5727 }
5728
5729 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5730 bool
5731 aarch64_bti_enabled (void)
5732 {
5733 return (aarch64_enable_bti == 1);
5734 }
5735
5736 /* Emit code to save the callee-saved registers from register number START
5737 to LIMIT to the stack at the location starting at offset START_OFFSET,
5738 skipping any write-back candidates if SKIP_WB is true. */
5739
5740 static void
5741 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5742 unsigned start, unsigned limit, bool skip_wb)
5743 {
5744 rtx_insn *insn;
5745 unsigned regno;
5746 unsigned regno2;
5747
5748 for (regno = aarch64_next_callee_save (start, limit);
5749 regno <= limit;
5750 regno = aarch64_next_callee_save (regno + 1, limit))
5751 {
5752 rtx reg, mem;
5753 poly_int64 offset;
5754 int offset_diff;
5755
5756 if (skip_wb
5757 && (regno == cfun->machine->frame.wb_candidate1
5758 || regno == cfun->machine->frame.wb_candidate2))
5759 continue;
5760
5761 if (cfun->machine->reg_is_wrapped_separately[regno])
5762 continue;
5763
5764 reg = gen_rtx_REG (mode, regno);
5765 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5766 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5767 offset));
5768
5769 regno2 = aarch64_next_callee_save (regno + 1, limit);
5770 offset_diff = cfun->machine->frame.reg_offset[regno2]
5771 - cfun->machine->frame.reg_offset[regno];
5772
5773 if (regno2 <= limit
5774 && !cfun->machine->reg_is_wrapped_separately[regno2]
5775 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5776 {
5777 rtx reg2 = gen_rtx_REG (mode, regno2);
5778 rtx mem2;
5779
5780 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5781 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5782 offset));
5783 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5784 reg2));
5785
5786 /* The first part of a frame-related parallel insn is
5787 always assumed to be relevant to the frame
5788 calculations; subsequent parts are only
5789 frame-related if explicitly marked. */
5790 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5791 regno = regno2;
5792 }
5793 else
5794 insn = emit_move_insn (mem, reg);
5795
5796 RTX_FRAME_RELATED_P (insn) = 1;
5797 }
5798 }
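/* For example (illustrative only): if x19 and x20 are both live callee-saves
   in consecutive slots, the loop above emits a single
   "stp x19, x20, [sp, <offset>]" rather than two separate stores, and marks
   the second element of the parallel as frame-related as described above. */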
5799
5800 /* Emit code to restore the callee registers of mode MODE from register
5801 number START up to and including LIMIT. Restore from the stack offset
5802 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5803 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5804
5805 static void
5806 aarch64_restore_callee_saves (machine_mode mode,
5807 poly_int64 start_offset, unsigned start,
5808 unsigned limit, bool skip_wb, rtx *cfi_ops)
5809 {
5810 rtx base_rtx = stack_pointer_rtx;
5811 unsigned regno;
5812 unsigned regno2;
5813 poly_int64 offset;
5814
5815 for (regno = aarch64_next_callee_save (start, limit);
5816 regno <= limit;
5817 regno = aarch64_next_callee_save (regno + 1, limit))
5818 {
5819 if (cfun->machine->reg_is_wrapped_separately[regno])
5820 continue;
5821
5822 rtx reg, mem;
5823 int offset_diff;
5824
5825 if (skip_wb
5826 && (regno == cfun->machine->frame.wb_candidate1
5827 || regno == cfun->machine->frame.wb_candidate2))
5828 continue;
5829
5830 reg = gen_rtx_REG (mode, regno);
5831 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5832 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5833
5834 regno2 = aarch64_next_callee_save (regno + 1, limit);
5835 offset_diff = cfun->machine->frame.reg_offset[regno2]
5836 - cfun->machine->frame.reg_offset[regno];
5837
5838 if (regno2 <= limit
5839 && !cfun->machine->reg_is_wrapped_separately[regno2]
5840 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5841 {
5842 rtx reg2 = gen_rtx_REG (mode, regno2);
5843 rtx mem2;
5844
5845 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5846 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5847 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5848
5849 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5850 regno = regno2;
5851 }
5852 else
5853 emit_move_insn (reg, mem);
5854 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5855 }
5856 }
5857
5858 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5859 of MODE. */
5860
5861 static inline bool
5862 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5863 {
5864 HOST_WIDE_INT multiple;
5865 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5866 && IN_RANGE (multiple, -8, 7));
5867 }
5868
5869 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5870 of MODE. */
5871
5872 static inline bool
5873 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5874 {
5875 HOST_WIDE_INT multiple;
5876 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5877 && IN_RANGE (multiple, 0, 63));
5878 }
5879
5880 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5881 of MODE. */
5882
5883 bool
5884 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5885 {
5886 HOST_WIDE_INT multiple;
5887 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5888 && IN_RANGE (multiple, -64, 63));
5889 }
5890
5891 /* Return true if OFFSET is a signed 9-bit value. */
5892
5893 bool
5894 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5895 poly_int64 offset)
5896 {
5897 HOST_WIDE_INT const_offset;
5898 return (offset.is_constant (&const_offset)
5899 && IN_RANGE (const_offset, -256, 255));
5900 }
5901
5902 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5903 of MODE. */
5904
5905 static inline bool
5906 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5907 {
5908 HOST_WIDE_INT multiple;
5909 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5910 && IN_RANGE (multiple, -256, 255));
5911 }
5912
5913 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5914 of MODE. */
5915
5916 static inline bool
5917 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5918 {
5919 HOST_WIDE_INT multiple;
5920 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5921 && IN_RANGE (multiple, 0, 4095));
5922 }
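/* For instance, for DImode (8-byte) accesses the predicate above accepts
   offsets 0, 8, ..., 32760 (4095 * 8), matching the unsigned scaled
   immediate range of a single LDR/STR. */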
5923
5924 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5925
5926 static sbitmap
5927 aarch64_get_separate_components (void)
5928 {
5929 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5930 bitmap_clear (components);
5931
5932 /* The registers we need saved to the frame. */
5933 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5934 if (aarch64_register_saved_on_entry (regno))
5935 {
5936 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5937 if (!frame_pointer_needed)
5938 offset += cfun->machine->frame.frame_size
5939 - cfun->machine->frame.hard_fp_offset;
5940 /* Check that we can access the stack slot of the register with one
5941 direct load with no adjustments needed. */
5942 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5943 bitmap_set_bit (components, regno);
5944 }
5945
5946 /* Don't mess with the hard frame pointer. */
5947 if (frame_pointer_needed)
5948 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5949
5950 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5951 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5952 /* If registers have been chosen to be stored/restored with
5953 writeback don't interfere with them to avoid having to output explicit
5954 stack adjustment instructions. */
5955 if (reg2 != INVALID_REGNUM)
5956 bitmap_clear_bit (components, reg2);
5957 if (reg1 != INVALID_REGNUM)
5958 bitmap_clear_bit (components, reg1);
5959
5960 bitmap_clear_bit (components, LR_REGNUM);
5961 bitmap_clear_bit (components, SP_REGNUM);
5962
5963 return components;
5964 }
5965
5966 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5967
5968 static sbitmap
5969 aarch64_components_for_bb (basic_block bb)
5970 {
5971 bitmap in = DF_LIVE_IN (bb);
5972 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5973 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5974 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5975
5976 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5977 bitmap_clear (components);
5978
5979 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5980 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5981 if ((!call_used_or_fixed_reg_p (regno)
5982 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5983 && (bitmap_bit_p (in, regno)
5984 || bitmap_bit_p (gen, regno)
5985 || bitmap_bit_p (kill, regno)))
5986 {
5987 unsigned regno2, offset, offset2;
5988 bitmap_set_bit (components, regno);
5989
5990 /* If there is a callee-save at an adjacent offset, add it too
5991 to increase the use of LDP/STP. */
5992 offset = cfun->machine->frame.reg_offset[regno];
5993 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5994
5995 if (regno2 <= LAST_SAVED_REGNUM)
5996 {
5997 offset2 = cfun->machine->frame.reg_offset[regno2];
5998 if ((offset & ~8) == (offset2 & ~8))
5999 bitmap_set_bit (components, regno2);
6000 }
6001 }
6002
6003 return components;
6004 }
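/* An example of the pairing heuristic above (illustrative only): if x20 lives
   at reg_offset 32, its candidate partner is x21; if x21's slot is at offset
   40, the two offsets agree once bit 3 is masked off, so x21 is added to the
   component set as well, enabling a later LDP/STP. */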
6005
6006 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6007 Nothing to do for aarch64. */
6008
6009 static void
6010 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6011 {
6012 }
6013
6014 /* Return the next set bit in BMP from START onwards. Return the total number
6015 of bits in BMP if no set bit is found at or after START. */
6016
6017 static unsigned int
6018 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6019 {
6020 unsigned int nbits = SBITMAP_SIZE (bmp);
6021 if (start == nbits)
6022 return start;
6023
6024 gcc_assert (start < nbits);
6025 for (unsigned int i = start; i < nbits; i++)
6026 if (bitmap_bit_p (bmp, i))
6027 return i;
6028
6029 return nbits;
6030 }
6031
6032 /* Do the work for aarch64_emit_prologue_components and
6033 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6034 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6035 for these components or the epilogue sequence. That is, it determines
6036 whether we should emit stores or loads and what kind of CFA notes to attach
6037 to the insns. Otherwise the logic for the two sequences is very
6038 similar. */
6039
6040 static void
6041 aarch64_process_components (sbitmap components, bool prologue_p)
6042 {
6043 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6044 ? HARD_FRAME_POINTER_REGNUM
6045 : STACK_POINTER_REGNUM);
6046
6047 unsigned last_regno = SBITMAP_SIZE (components);
6048 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6049 rtx_insn *insn = NULL;
6050
6051 while (regno != last_regno)
6052 {
6053 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6054 so DFmode for the vector registers is enough. For simd functions
6055 we want to save the low 128 bits. */
6056 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6057
6058 rtx reg = gen_rtx_REG (mode, regno);
6059 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6060 if (!frame_pointer_needed)
6061 offset += cfun->machine->frame.frame_size
6062 - cfun->machine->frame.hard_fp_offset;
6063 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6064 rtx mem = gen_frame_mem (mode, addr);
6065
6066 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6067 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6068 /* No more registers to handle after REGNO.
6069 Emit a single save/restore and exit. */
6070 if (regno2 == last_regno)
6071 {
6072 insn = emit_insn (set);
6073 RTX_FRAME_RELATED_P (insn) = 1;
6074 if (prologue_p)
6075 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6076 else
6077 add_reg_note (insn, REG_CFA_RESTORE, reg);
6078 break;
6079 }
6080
6081 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6082 /* The next register is not of the same class or its offset is not
6083 mergeable with the current one into a pair. */
6084 if (!satisfies_constraint_Ump (mem)
6085 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6086 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6087 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6088 GET_MODE_SIZE (mode)))
6089 {
6090 insn = emit_insn (set);
6091 RTX_FRAME_RELATED_P (insn) = 1;
6092 if (prologue_p)
6093 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6094 else
6095 add_reg_note (insn, REG_CFA_RESTORE, reg);
6096
6097 regno = regno2;
6098 continue;
6099 }
6100
6101 /* REGNO2 can be saved/restored in a pair with REGNO. */
6102 rtx reg2 = gen_rtx_REG (mode, regno2);
6103 if (!frame_pointer_needed)
6104 offset2 += cfun->machine->frame.frame_size
6105 - cfun->machine->frame.hard_fp_offset;
6106 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6107 rtx mem2 = gen_frame_mem (mode, addr2);
6108 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6109 : gen_rtx_SET (reg2, mem2);
6110
6111 if (prologue_p)
6112 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6113 else
6114 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6115
6116 RTX_FRAME_RELATED_P (insn) = 1;
6117 if (prologue_p)
6118 {
6119 add_reg_note (insn, REG_CFA_OFFSET, set);
6120 add_reg_note (insn, REG_CFA_OFFSET, set2);
6121 }
6122 else
6123 {
6124 add_reg_note (insn, REG_CFA_RESTORE, reg);
6125 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6126 }
6127
6128 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6129 }
6130 }
6131
6132 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6133
6134 static void
6135 aarch64_emit_prologue_components (sbitmap components)
6136 {
6137 aarch64_process_components (components, true);
6138 }
6139
6140 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6141
6142 static void
6143 aarch64_emit_epilogue_components (sbitmap components)
6144 {
6145 aarch64_process_components (components, false);
6146 }
6147
6148 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6149
6150 static void
6151 aarch64_set_handled_components (sbitmap components)
6152 {
6153 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6154 if (bitmap_bit_p (components, regno))
6155 cfun->machine->reg_is_wrapped_separately[regno] = true;
6156 }
6157
6158 /* On AArch64 we have an ABI-defined safe buffer. This constant is used when
6159 determining the probe offset for alloca. */
6160
6161 static HOST_WIDE_INT
6162 aarch64_stack_clash_protection_alloca_probe_range (void)
6163 {
6164 return STACK_CLASH_CALLER_GUARD;
6165 }
6166
6167
6168 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6169 registers. If POLY_SIZE is not large enough to require a probe this function
6170 will only adjust the stack. When allocating the stack space,
6171 FRAME_RELATED_P indicates whether the allocation is frame related.
6172 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6173 arguments. If we are, we ensure that any allocation larger than the
6174 ABI-defined buffer is probed, so that the invariant of having a 1KB
6175 buffer is maintained.
6176
6177 We emit barriers after each stack adjustment to prevent optimizations from
6178 breaking the invariant that we never drop the stack more than a page. This
6179 invariant is needed to make it easier to correctly handle asynchronous
6180 events: e.g. if we were to allow the stack to be dropped by more than a page
6181 and then emit multiple probes to catch up, and a signal were taken somewhere
6182 in between, the signal handler would not know the state of the stack and
6183 could make no assumptions about which pages have been probed. */
6184
6185 static void
6186 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6187 poly_int64 poly_size,
6188 bool frame_related_p,
6189 bool final_adjustment_p)
6190 {
6191 HOST_WIDE_INT guard_size
6192 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6193 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6194 /* When doing the final adjustment for the outgoing argument size we can't
6195 assume that LR was saved at position 0. So subtract its offset from the
6196 ABI safe buffer so that we don't accidentally allow an adjustment that
6197 would result in an allocation larger than the ABI buffer without
6198 probing. */
6199 HOST_WIDE_INT min_probe_threshold
6200 = final_adjustment_p
6201 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6202 : guard_size - guard_used_by_caller;
6203
6204 poly_int64 frame_size = cfun->machine->frame.frame_size;
6205
6206 /* We should always have a positive probe threshold. */
6207 gcc_assert (min_probe_threshold > 0);
6208
6209 if (flag_stack_clash_protection && !final_adjustment_p)
6210 {
6211 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6212 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6213
6214 if (known_eq (frame_size, 0))
6215 {
6216 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6217 }
6218 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6219 && known_lt (final_adjust, guard_used_by_caller))
6220 {
6221 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6222 }
6223 }
6224
6225 /* If SIZE is not large enough to require probing, just adjust the stack and
6226 exit. */
6227 if (known_lt (poly_size, min_probe_threshold)
6228 || !flag_stack_clash_protection)
6229 {
6230 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6231 return;
6232 }
6233
6234 HOST_WIDE_INT size;
6235 /* Handle the SVE non-constant case first. */
6236 if (!poly_size.is_constant (&size))
6237 {
6238 if (dump_file)
6239 {
6240 fprintf (dump_file, "Stack clash SVE prologue: ");
6241 print_dec (poly_size, dump_file);
6242 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6243 }
6244
6245 /* First calculate the amount of bytes we're actually spilling. */
6246 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6247 poly_size, temp1, temp2, false, true);
6248
6249 rtx_insn *insn = get_last_insn ();
6250
6251 if (frame_related_p)
6252 {
6253 /* This is done to provide unwinding information for the stack
6254 adjustments we're about to do, however to prevent the optimizers
6255 from removing the R11 move and leaving the CFA note (which would be
6256 very wrong) we tie the old and new stack pointer together.
6257 The tie will expand to nothing but the optimizers will not touch
6258 the instruction. */
6259 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6260 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6261 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6262
6263 /* We want the CFA independent of the stack pointer for the
6264 duration of the loop. */
6265 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6266 RTX_FRAME_RELATED_P (insn) = 1;
6267 }
6268
6269 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6270 rtx guard_const = gen_int_mode (guard_size, Pmode);
6271
6272 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6273 stack_pointer_rtx, temp1,
6274 probe_const, guard_const));
6275
6276 /* Now reset the CFA register if needed. */
6277 if (frame_related_p)
6278 {
6279 add_reg_note (insn, REG_CFA_DEF_CFA,
6280 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6281 gen_int_mode (poly_size, Pmode)));
6282 RTX_FRAME_RELATED_P (insn) = 1;
6283 }
6284
6285 return;
6286 }
6287
6288 if (dump_file)
6289 fprintf (dump_file,
6290 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6291 " bytes, probing will be required.\n", size);
6292
6293 /* Round size to the nearest multiple of guard_size, and calculate the
6294 residual as the difference between the original size and the rounded
6295 size. */
6296 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6297 HOST_WIDE_INT residual = size - rounded_size;
6298
6299 /* We can handle a small number of allocations/probes inline. Otherwise
6300 punt to a loop. */
6301 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6302 {
6303 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6304 {
6305 aarch64_sub_sp (NULL, temp2, guard_size, true);
6306 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6307 guard_used_by_caller));
6308 emit_insn (gen_blockage ());
6309 }
6310 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6311 }
6312 else
6313 {
6314 /* Compute the ending address. */
6315 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6316 temp1, NULL, false, true);
6317 rtx_insn *insn = get_last_insn ();
6318
6319 /* For the initial allocation, we don't have a frame pointer
6320 set up, so we always need CFI notes. If we're doing the
6321 final allocation, then we may have a frame pointer, in which
6322 case it is the CFA, otherwise we need CFI notes.
6323
6324 We can determine which allocation we are doing by looking at
6325 the value of FRAME_RELATED_P since the final allocations are not
6326 frame related. */
6327 if (frame_related_p)
6328 {
6329 /* We want the CFA independent of the stack pointer for the
6330 duration of the loop. */
6331 add_reg_note (insn, REG_CFA_DEF_CFA,
6332 plus_constant (Pmode, temp1, rounded_size));
6333 RTX_FRAME_RELATED_P (insn) = 1;
6334 }
6335
6336 /* This allocates and probes the stack. Note that this re-uses some of
6337 the existing Ada stack protection code. However we are guaranteed not
6338 to enter the non-loop or residual branches of that code.
6339
6340 The non-loop part won't be entered because if our allocation amount
6341 doesn't require a loop, the case above would handle it.
6342
6343 The residual amount won't be entered because TEMP1 is a multiple of
6344 the allocation size. The residual will always be 0. As such, the only
6345 part we are actually using from that code is the loop setup. The
6346 actual probing is done in aarch64_output_probe_stack_range. */
6347 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6348 stack_pointer_rtx, temp1));
6349
6350 /* Now reset the CFA register if needed. */
6351 if (frame_related_p)
6352 {
6353 add_reg_note (insn, REG_CFA_DEF_CFA,
6354 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6355 RTX_FRAME_RELATED_P (insn) = 1;
6356 }
6357
6358 emit_insn (gen_blockage ());
6359 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6360 }
6361
6362 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6363 be probed. This maintains the requirement that each page is probed at
6364 least once. For initial probing we probe only if the allocation is
6365 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6366 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6367 GUARD_SIZE. This means that for any allocation large enough to trigger a
6368 probe here, we'll have at least one, and for any allocation that is not
6369 large enough for this code to emit anything, the page will already have been
6370 probed by the saving of FP/LR, either by this function or by any callees. If
6371 we don't have any callees then we won't have more stack adjustments and so
6372 are still safe. */
6373 if (residual)
6374 {
6375 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6376 /* If we're doing final adjustments, and we've done any full page
6377 allocations then any residual needs to be probed. */
6378 if (final_adjustment_p && rounded_size != 0)
6379 min_probe_threshold = 0;
6380 /* If doing a small final adjustment, we always probe at offset 0.
6381 This is done to avoid issues when LR is not at position 0 or when
6382 the final adjustment is smaller than the probing offset. */
6383 else if (final_adjustment_p && rounded_size == 0)
6384 residual_probe_offset = 0;
6385
6386 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6387 if (residual >= min_probe_threshold)
6388 {
6389 if (dump_file)
6390 fprintf (dump_file,
6391 "Stack clash AArch64 prologue residuals: "
6392 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6393 "\n", residual);
6394
6395 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6396 residual_probe_offset));
6397 emit_insn (gen_blockage ());
6398 }
6399 }
6400 }
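/* A hedged example of the constant-size path above: assuming the default
   64 KiB guard and the 1 KiB caller guard, a frame-related allocation of
   133120 bytes rounds down to two full guard-sized pages with a 2 KiB
   residual, giving roughly (scheduling barriers omitted)

	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #2048

   The residual is below the probing threshold, so it is not probed here; the
   saving of FP/LR provides the implicit probe for that final page. */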
6401
6402 /* Return 1 if the register is used by the epilogue. We need to say the
6403 return register is used, but only after epilogue generation is complete.
6404 Note that in the case of sibcalls, the values "used by the epilogue" are
6405 considered live at the start of the called function.
6406
6407 For SIMD functions we need to return 1 for FP registers that are saved and
6408 restored by a function but are not zero in call_used_regs. If we do not do
6409 this, optimizations may remove the restore of the register. */
6410
6411 int
6412 aarch64_epilogue_uses (int regno)
6413 {
6414 if (epilogue_completed)
6415 {
6416 if (regno == LR_REGNUM)
6417 return 1;
6418 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6419 return 1;
6420 }
6421 return 0;
6422 }
6423
6424 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6425 is saved at BASE + OFFSET. */
6426
6427 static void
6428 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6429 rtx base, poly_int64 offset)
6430 {
6431 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6432 add_reg_note (insn, REG_CFA_EXPRESSION,
6433 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6434 }
6435
6436 /* AArch64 stack frames generated by this compiler look like:
6437
6438 +-------------------------------+
6439 | |
6440 | incoming stack arguments |
6441 | |
6442 +-------------------------------+
6443 | | <-- incoming stack pointer (aligned)
6444 | callee-allocated save area |
6445 | for register varargs |
6446 | |
6447 +-------------------------------+
6448 | local variables | <-- frame_pointer_rtx
6449 | |
6450 +-------------------------------+
6451 | padding | \
6452 +-------------------------------+ |
6453 | callee-saved registers | | frame.saved_regs_size
6454 +-------------------------------+ |
6455 | LR' | |
6456 +-------------------------------+ |
6457 | FP' | / <- hard_frame_pointer_rtx (aligned)
6458 +-------------------------------+
6459 | dynamic allocation |
6460 +-------------------------------+
6461 | padding |
6462 +-------------------------------+
6463 | outgoing stack arguments | <-- arg_pointer
6464 | |
6465 +-------------------------------+
6466 | | <-- stack_pointer_rtx (aligned)
6467
6468 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6469 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6470 unchanged.
6471
6472 By default for stack-clash we assume the guard is at least 64KB, but this
6473 value is configurable to either 4KB or 64KB. We also force the guard size to
6474 be the same as the probing interval and both values are kept in sync.
6475
6476 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6477 on the guard size) of stack space without probing.
6478
6479 When probing is needed, we emit a probe at the start of the prologue
6480 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6481
6482 We have to track how much space has been allocated and the only stores
6483 to the stack we track as implicit probes are the FP/LR stores.
6484
6485 For outgoing arguments we probe if the size is larger than 1KB, such that
6486 the ABI specified buffer is maintained for the next callee.
6487
6488 The following registers are reserved during frame layout and should not be
6489 used for any other purpose:
6490
6491 - r11: Used by stack clash protection when SVE is enabled.
6492 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6493 - r14 and r15: Used for speculation tracking.
6494 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6495 - r30(LR), r29(FP): Used by standard frame layout.
6496
6497 These registers must be avoided in frame layout related code unless the
6498 explicit intention is to interact with one of the features listed above. */
6499
6500 /* Generate the prologue instructions for entry into a function.
6501 Establish the stack frame by decreasing the stack pointer with a
6502 properly calculated size and, if necessary, create a frame record
6503 filled with the values of LR and previous frame pointer. The
6504 current FP is also set up if it is in use. */
6505
6506 void
6507 aarch64_expand_prologue (void)
6508 {
6509 poly_int64 frame_size = cfun->machine->frame.frame_size;
6510 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6511 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6512 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6513 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6514 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6515 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6516 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6517 rtx_insn *insn;
6518
6519 /* Sign return address for functions. */
6520 if (aarch64_return_address_signing_enabled ())
6521 {
6522 switch (aarch64_ra_sign_key)
6523 {
6524 case AARCH64_KEY_A:
6525 insn = emit_insn (gen_paciasp ());
6526 break;
6527 case AARCH64_KEY_B:
6528 insn = emit_insn (gen_pacibsp ());
6529 break;
6530 default:
6531 gcc_unreachable ();
6532 }
6533 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6534 RTX_FRAME_RELATED_P (insn) = 1;
6535 }
6536
6537 if (flag_stack_usage_info)
6538 current_function_static_stack_size = constant_lower_bound (frame_size);
6539
6540 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6541 {
6542 if (crtl->is_leaf && !cfun->calls_alloca)
6543 {
6544 if (maybe_gt (frame_size, PROBE_INTERVAL)
6545 && maybe_gt (frame_size, get_stack_check_protect ()))
6546 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6547 (frame_size
6548 - get_stack_check_protect ()));
6549 }
6550 else if (maybe_gt (frame_size, 0))
6551 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6552 }
6553
6554 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6555 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6556
6557 /* In theory we should never have both an initial adjustment
6558 and a callee save adjustment. Verify that is the case since the
6559 code below does not handle it for -fstack-clash-protection. */
6560 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6561
6562 /* Will only probe if the initial adjustment is larger than the guard
6563 less the amount of the guard reserved for use by the caller's
6564 outgoing args. */
6565 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6566 true, false);
6567
6568 if (callee_adjust != 0)
6569 aarch64_push_regs (reg1, reg2, callee_adjust);
6570
6571 if (emit_frame_chain)
6572 {
6573 poly_int64 reg_offset = callee_adjust;
6574 if (callee_adjust == 0)
6575 {
6576 reg1 = R29_REGNUM;
6577 reg2 = R30_REGNUM;
6578 reg_offset = callee_offset;
6579 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6580 }
6581 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6582 stack_pointer_rtx, callee_offset,
6583 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6584 if (frame_pointer_needed && !frame_size.is_constant ())
6585 {
6586 /* Variable-sized frames need to describe the save slot
6587 address using DW_CFA_expression rather than DW_CFA_offset.
6588 This means that, without taking further action, the
6589 locations of the registers that we've already saved would
6590 remain based on the stack pointer even after we redefine
6591 the CFA based on the frame pointer. We therefore need new
6592 DW_CFA_expressions to re-express the save slots with addresses
6593 based on the frame pointer. */
6594 rtx_insn *insn = get_last_insn ();
6595 gcc_assert (RTX_FRAME_RELATED_P (insn));
6596
6597 /* Add an explicit CFA definition if this was previously
6598 implicit. */
6599 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6600 {
6601 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6602 callee_offset);
6603 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6604 gen_rtx_SET (hard_frame_pointer_rtx, src));
6605 }
6606
6607 /* Change the save slot expressions for the registers that
6608 we've already saved. */
6609 reg_offset -= callee_offset;
6610 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6611 reg_offset + UNITS_PER_WORD);
6612 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6613 reg_offset);
6614 }
6615 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6616 }
6617
6618 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6619 callee_adjust != 0 || emit_frame_chain);
6620 if (aarch64_simd_decl_p (cfun->decl))
6621 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6622 callee_adjust != 0 || emit_frame_chain);
6623 else
6624 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6625 callee_adjust != 0 || emit_frame_chain);
6626
6627 /* We may need to probe the final adjustment if it is larger than the guard
6628 that is assumed by the callee. */
6629 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6630 !frame_pointer_needed, true);
6631 }
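/* Putting the pieces together, a sketch (not verbatim output) of the
   prologue for the small 80-byte constant frame described after
   aarch64_layout_frame, with a frame chain and x19 saved, would be

	stp	x29, x30, [sp, -80]!
	mov	x29, sp
	str	x19, [sp, 16]

   with a "paciasp" emitted first when return-address signing is enabled. */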
6632
6633 /* Return TRUE if we can use a simple_return insn.
6634
6635 This function checks whether the callee saved stack is empty, which
6636 means no restore actions are needed. The pro_and_epilogue will use
6637 this to check whether shrink-wrapping opt is feasible. */
6638
6639 bool
6640 aarch64_use_return_insn_p (void)
6641 {
6642 if (!reload_completed)
6643 return false;
6644
6645 if (crtl->profile)
6646 return false;
6647
6648 return known_eq (cfun->machine->frame.frame_size, 0);
6649 }
6650
6651 /* Return false for non-leaf SIMD functions in order to avoid
6652 shrink-wrapping them. Shrink-wrapping would lose the necessary
6653 save/restore of FP registers. */
6654
6655 bool
6656 aarch64_use_simple_return_insn_p (void)
6657 {
6658 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6659 return false;
6660
6661 return true;
6662 }
6663
6664 /* Generate the epilogue instructions for returning from a function.
6665 This is almost exactly the reverse of the prolog sequence, except
6666 that we need to insert barriers to avoid scheduling loads that read
6667 from a deallocated stack, and we optimize the unwind records by
6668 emitting them all together if possible. */
6669 void
6670 aarch64_expand_epilogue (bool for_sibcall)
6671 {
6672 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6673 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6674 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6675 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6676 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6677 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6678 rtx cfi_ops = NULL;
6679 rtx_insn *insn;
6680 /* A stack clash protection prologue may not have left EP0_REGNUM or
6681 EP1_REGNUM in a usable state. The same is true for allocations
6682 with an SVE component, since we then need both temporary registers
6683 for each allocation. For stack clash we are in a usable state if
6684 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6685 HOST_WIDE_INT guard_size
6686 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6687 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6688
6689 /* We can re-use the registers when the allocation amount is smaller than
6690 guard_size - guard_used_by_caller because we won't be doing any probes
6691 then. In such situations the register should remain live with the correct
6692 value. */
6693 bool can_inherit_p = (initial_adjust.is_constant ()
6694 && final_adjust.is_constant ())
6695 && (!flag_stack_clash_protection
6696 || known_lt (initial_adjust,
6697 guard_size - guard_used_by_caller));
6698
6699 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6700 bool need_barrier_p
6701 = maybe_ne (get_frame_size ()
6702 + cfun->machine->frame.saved_varargs_size, 0);
6703
6704 /* Emit a barrier to prevent loads from a deallocated stack. */
6705 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6706 || cfun->calls_alloca
6707 || crtl->calls_eh_return)
6708 {
6709 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6710 need_barrier_p = false;
6711 }
6712
6713 /* Restore the stack pointer from the frame pointer if it may not
6714 be the same as the stack pointer. */
6715 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6716 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6717 if (frame_pointer_needed
6718 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6719 /* If writeback is used when restoring callee-saves, the CFA
6720 is restored on the instruction doing the writeback. */
6721 aarch64_add_offset (Pmode, stack_pointer_rtx,
6722 hard_frame_pointer_rtx, -callee_offset,
6723 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6724 else
6725 /* The case where we need to re-use the register here is very rare, so
6726 avoid the complicated condition and just always emit a move if the
6727 immediate doesn't fit. */
6728 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6729
6730 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6731 callee_adjust != 0, &cfi_ops);
6732 if (aarch64_simd_decl_p (cfun->decl))
6733 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6734 callee_adjust != 0, &cfi_ops);
6735 else
6736 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6737 callee_adjust != 0, &cfi_ops);
6738
6739 if (need_barrier_p)
6740 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6741
6742 if (callee_adjust != 0)
6743 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6744
6745 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6746 {
6747 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6748 insn = get_last_insn ();
6749 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6750 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6751 RTX_FRAME_RELATED_P (insn) = 1;
6752 cfi_ops = NULL;
6753 }
6754
6755 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6756 we restrict the emit_move optimization to leaf functions. */
6757 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6758 (!can_inherit_p || !crtl->is_leaf
6759 || df_regs_ever_live_p (EP0_REGNUM)));
6760
6761 if (cfi_ops)
6762 {
6763 /* Emit delayed restores and reset the CFA to be SP. */
6764 insn = get_last_insn ();
6765 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6766 REG_NOTES (insn) = cfi_ops;
6767 RTX_FRAME_RELATED_P (insn) = 1;
6768 }
6769
6770 /* We prefer to emit the combined return/authenticate instruction RETAA;
6771 however, there are three cases in which we must instead emit an explicit
6772 authentication instruction.
6773
6774 1) Sibcalls don't return in a normal way, so if we're about to call one
6775 we must authenticate.
6776
6777 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6778 generating code for !TARGET_ARMV8_3 we can't use it and must
6779 explicitly authenticate.
6780
6781 3) On an eh_return path we make extra stack adjustments to update the
6782 canonical frame address to be the exception handler's CFA. We want
6783 to authenticate using the CFA of the function which calls eh_return.
6784 */
6785 if (aarch64_return_address_signing_enabled ()
6786 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6787 {
6788 switch (aarch64_ra_sign_key)
6789 {
6790 case AARCH64_KEY_A:
6791 insn = emit_insn (gen_autiasp ());
6792 break;
6793 case AARCH64_KEY_B:
6794 insn = emit_insn (gen_autibsp ());
6795 break;
6796 default:
6797 gcc_unreachable ();
6798 }
6799 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6800 RTX_FRAME_RELATED_P (insn) = 1;
6801 }
6802
6803 /* Stack adjustment for exception handler. */
6804 if (crtl->calls_eh_return && !for_sibcall)
6805 {
6806 /* We need to unwind the stack by the offset computed by
6807 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6808 to be SP; letting the CFA move during this adjustment
6809 is just as correct as retaining the CFA from the body
6810 of the function. Therefore, do nothing special. */
6811 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6812 }
6813
6814 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6815 if (!for_sibcall)
6816 emit_jump_insn (ret_rtx);
6817 }
6818
6819 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6820 normally or return to a previous frame after unwinding.
6821
6822 An EH return uses a single shared return sequence. The epilogue is
6823 exactly like a normal epilogue except that it has an extra input
6824 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6825 that must be applied after the frame has been destroyed. An extra label
6826 is inserted before the epilogue which initializes this register to zero,
6827 and this is the entry point for a normal return.
6828
6829 An actual EH return updates the return address, initializes the stack
6830 adjustment and jumps directly into the epilogue (bypassing the zeroing
6831 of the adjustment). Since the return address is typically saved on the
6832 stack when a function makes a call, the saved LR must be updated outside
6833 the epilogue.
6834
6835 This poses problems as the store is generated well before the epilogue,
6836 so the offset of LR is not known yet. Also optimizations will remove the
6837 store as it appears dead, even after the epilogue is generated (as the
6838 base or offset for loading LR is different in many cases).
6839
6840 To avoid these problems this implementation forces the frame pointer
6841 in eh_return functions so that the location of LR is fixed and known early.
6842 It also marks the store volatile, so no optimization is permitted to
6843 remove the store. */
6844 rtx
6845 aarch64_eh_return_handler_rtx (void)
6846 {
6847 rtx tmp = gen_frame_mem (Pmode,
6848 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6849
6850 /* Mark the store volatile, so no optimization is permitted to remove it. */
6851 MEM_VOLATILE_P (tmp) = true;
6852 return tmp;
6853 }
6854
6855 /* Output code to add DELTA to the first argument, and then jump
6856 to FUNCTION. Used for C++ multiple inheritance. */
6857 static void
6858 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6859 HOST_WIDE_INT delta,
6860 HOST_WIDE_INT vcall_offset,
6861 tree function)
6862 {
6863 /* The this pointer is always in x0. Note that this differs from
6864 Arm where the this pointer may be bumped to r1 if r0 is required
6865 to return a pointer to an aggregate. On AArch64 a result value
6866 pointer will be in x8. */
6867 int this_regno = R0_REGNUM;
6868 rtx this_rtx, temp0, temp1, addr, funexp;
6869 rtx_insn *insn;
6870 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6871
6872 if (aarch64_bti_enabled ())
6873 emit_insn (gen_bti_c ());
6874
6875 reload_completed = 1;
6876 emit_note (NOTE_INSN_PROLOGUE_END);
6877
6878 this_rtx = gen_rtx_REG (Pmode, this_regno);
6879 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6880 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6881
6882 if (vcall_offset == 0)
6883 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6884 else
6885 {
6886 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6887
6888 addr = this_rtx;
6889 if (delta != 0)
6890 {
6891 if (delta >= -256 && delta < 256)
6892 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6893 plus_constant (Pmode, this_rtx, delta));
6894 else
6895 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6896 temp1, temp0, false);
6897 }
6898
6899 if (Pmode == ptr_mode)
6900 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6901 else
6902 aarch64_emit_move (temp0,
6903 gen_rtx_ZERO_EXTEND (Pmode,
6904 gen_rtx_MEM (ptr_mode, addr)));
6905
6906 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6907 addr = plus_constant (Pmode, temp0, vcall_offset);
6908 else
6909 {
6910 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6911 Pmode);
6912 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6913 }
6914
6915 if (Pmode == ptr_mode)
6916 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6917 else
6918 aarch64_emit_move (temp1,
6919 gen_rtx_SIGN_EXTEND (Pmode,
6920 gen_rtx_MEM (ptr_mode, addr)));
6921
6922 emit_insn (gen_add2_insn (this_rtx, temp1));
6923 }
6924
6925 /* Generate a tail call to the target function. */
6926 if (!TREE_USED (function))
6927 {
6928 assemble_external (function);
6929 TREE_USED (function) = 1;
6930 }
6931 funexp = XEXP (DECL_RTL (function), 0);
6932 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6933 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6934 SIBLING_CALL_P (insn) = 1;
6935
6936 insn = get_insns ();
6937 shorten_branches (insn);
6938
6939 assemble_start_function (thunk, fnname);
6940 final_start_function (insn, file, 1);
6941 final (insn, file, 1);
6942 final_end_function ();
6943 assemble_end_function (thunk, fnname);
6944
6945 /* Stop pretending to be a post-reload pass. */
6946 reload_completed = 0;
6947 }
6948
6949 static bool
6950 aarch64_tls_referenced_p (rtx x)
6951 {
6952 if (!TARGET_HAVE_TLS)
6953 return false;
6954 subrtx_iterator::array_type array;
6955 FOR_EACH_SUBRTX (iter, array, x, ALL)
6956 {
6957 const_rtx x = *iter;
6958 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6959 return true;
6960 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6961 TLS offsets, not real symbol references. */
6962 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6963 iter.skip_subrtxes ();
6964 }
6965 return false;
6966 }
6967
6968
6969 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6970 a left shift of 0 or 12 bits. */
6971 bool
6972 aarch64_uimm12_shift (HOST_WIDE_INT val)
6973 {
6974 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6975 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6976 );
6977 }
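/* For example, 0xfff and 0xabc000 satisfy this test (left shift of 0 and 12
   respectively), whereas 0x1001 does not, because its set bits straddle the
   two 12-bit fields.  */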
6978
6979 /* Return the largest value no greater than VAL that can be encoded as a
6980 12-bit unsigned immediate with a left shift of 0 or 12. */
6981 static HOST_WIDE_INT
6982 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6983 {
6984 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6985 handle correctly. */
6986 gcc_assert ((val & 0xffffff) == val);
6987
6988 if (((val & 0xfff) << 0) == val)
6989 return val;
6990
6991 return val & (0xfff << 12);
6992 }
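/* For example, 0x123456 is clamped to 0x123000, while 0x456 already fits in
   the low 12 bits and is returned unchanged.  */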
6993
6994 /* Return true if val is an immediate that can be loaded into a
6995 register by a MOVZ instruction. */
6996 static bool
6997 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6998 {
6999 if (GET_MODE_SIZE (mode) > 4)
7000 {
7001 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7002 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7003 return true;
7004 }
7005 else
7006 {
7007 /* Ignore sign extension. */
7008 val &= (HOST_WIDE_INT) 0xffffffff;
7009 }
7010 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7011 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7012 }
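/* For example, 0xabcd00000000 is accepted in DImode (16 significant bits at
   a 16-bit-aligned position), whereas 0x1ffff is rejected because its set
   bits cross a 16-bit boundary.  */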
7013
7014 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7015 64-bit (DImode) integer. */
7016
7017 static unsigned HOST_WIDE_INT
7018 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7019 {
7020 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7021 while (size < 64)
7022 {
7023 val &= (HOST_WIDE_INT_1U << size) - 1;
7024 val |= val << size;
7025 size *= 2;
7026 }
7027 return val;
7028 }
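/* For example, replicating the 8-bit element value 0x3c gives
   0x3c3c3c3c3c3c3c3c.  */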
7029
7030 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7031
7032 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7033 {
7034 0x0000000100000001ull,
7035 0x0001000100010001ull,
7036 0x0101010101010101ull,
7037 0x1111111111111111ull,
7038 0x5555555555555555ull,
7039 };
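/* aarch64_bitmask_imm below indexes this table by __builtin_clz (width) - 26,
   so an element width of 32 maps to index 0, 16 to index 1, ..., 2 to index 4;
   multiplying the isolated element pattern by the corresponding constant
   replicates it across all 64 bits.  */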
7040
7041
7042 /* Return true if val is a valid bitmask immediate. */
7043
7044 bool
7045 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7046 {
7047 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7048 int bits;
7049
7050 /* Check for a single sequence of one bits and return quickly if so.
7051 The special cases of all ones and all zeroes return false. */
7052 val = aarch64_replicate_bitmask_imm (val_in, mode);
7053 tmp = val + (val & -val);
7054
7055 if (tmp == (tmp & -tmp))
7056 return (val + 1) > 1;
7057
7058 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7059 if (mode == SImode)
7060 val = (val << 32) | (val & 0xffffffff);
7061
7062 /* Invert if the immediate doesn't start with a zero bit - this means we
7063 only need to search for sequences of one bits. */
7064 if (val & 1)
7065 val = ~val;
7066
7067 /* Find the first set bit and set tmp to val with the first sequence of one
7068 bits removed. Return success if there is a single sequence of ones. */
7069 first_one = val & -val;
7070 tmp = val & (val + first_one);
7071
7072 if (tmp == 0)
7073 return true;
7074
7075 /* Find the next set bit and compute the difference in bit position. */
7076 next_one = tmp & -tmp;
7077 bits = clz_hwi (first_one) - clz_hwi (next_one);
7078 mask = val ^ tmp;
7079
7080 /* Check the bit position difference is a power of 2, and that the first
7081 sequence of one bits fits within 'bits' bits. */
7082 if ((mask >> bits) != 0 || bits != (bits & -bits))
7083 return false;
7084
7085 /* Check the sequence of one bits is repeated 64/bits times. */
7086 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7087 }
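/* For example, 0x00ff00ff00ff00ff is accepted (a run of eight ones repeated
   in every 16-bit element), whereas 0x0000000000000101 is rejected because
   the 8-bit repetition it would require does not hold across all 64 bits.  */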
7088
7089 /* Create a mask of ones covering the range from the lowest to the highest
7090 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7091
7092 unsigned HOST_WIDE_INT
7093 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7094 {
7095 int lowest_bit_set = ctz_hwi (val_in);
7096 int highest_bit_set = floor_log2 (val_in);
7097 gcc_assert (val_in != 0);
7098
7099 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7100 (HOST_WIDE_INT_1U << lowest_bit_set));
7101 }
7102
7103 /* Create a constant in which all bits outside the range from the lowest to
7104 the highest bit set in VAL_IN are set to 1. */
7105
7106 unsigned HOST_WIDE_INT
7107 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7108 {
7109 return val_in | ~aarch64_and_split_imm1 (val_in);
7110 }
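/* For example, for VAL_IN == 0xe70 the two halves are imm1 == 0xff0 and
   imm2 == 0xfffffffffffffe7f, and imm1 & imm2 == VAL_IN, so an AND with
   VAL_IN can be performed as an AND with imm1 followed by an AND with imm2,
   each of which is itself a bitmask immediate.  */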
7111
7112 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7113
7114 bool
7115 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7116 {
7117 scalar_int_mode int_mode;
7118 if (!is_a <scalar_int_mode> (mode, &int_mode))
7119 return false;
7120
7121 if (aarch64_bitmask_imm (val_in, int_mode))
7122 return false;
7123
7124 if (aarch64_move_imm (val_in, int_mode))
7125 return false;
7126
7127 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7128
7129 return aarch64_bitmask_imm (imm2, int_mode);
7130 }
7131
7132 /* Return true if val is an immediate that can be loaded into a
7133 register in a single instruction. */
7134 bool
7135 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7136 {
7137 scalar_int_mode int_mode;
7138 if (!is_a <scalar_int_mode> (mode, &int_mode))
7139 return false;
7140
7141 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7142 return true;
7143 return aarch64_bitmask_imm (val, int_mode);
7144 }
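/* In other words, a constant is a single-instruction immediate if it can be
   loaded by MOVZ, by MOVN (the inverted MOVZ check above), or by the MOV
   alias of ORR with a bitmask immediate.  */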
7145
7146 static bool
7147 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7148 {
7149 rtx base, offset;
7150
7151 if (GET_CODE (x) == HIGH)
7152 return true;
7153
7154 /* There's no way to calculate VL-based values using relocations. */
7155 subrtx_iterator::array_type array;
7156 FOR_EACH_SUBRTX (iter, array, x, ALL)
7157 if (GET_CODE (*iter) == CONST_POLY_INT)
7158 return true;
7159
7160 split_const (x, &base, &offset);
7161 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7162 {
7163 if (aarch64_classify_symbol (base, INTVAL (offset))
7164 != SYMBOL_FORCE_TO_MEM)
7165 return true;
7166 else
7167 /* Avoid generating a 64-bit relocation in ILP32; leave
7168 to aarch64_expand_mov_immediate to handle it properly. */
7169 return mode != ptr_mode;
7170 }
7171
7172 return aarch64_tls_referenced_p (x);
7173 }
7174
7175 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7176 The expansion of a table switch is quite expensive due to the number
7177 of instructions, the table lookup and the hard-to-predict indirect jump.
7178 When optimizing for speed and -O3 is enabled, use the per-core tuning if
7179 set, otherwise use tables for > 16 cases as a trade-off between size and
7180 performance. When optimizing for size, use the default setting. */
7181
7182 static unsigned int
7183 aarch64_case_values_threshold (void)
7184 {
7185 /* Use the specified limit for the number of cases before using jump
7186 tables at higher optimization levels. */
7187 if (optimize > 2
7188 && selected_cpu->tune->max_case_values != 0)
7189 return selected_cpu->tune->max_case_values;
7190 else
7191 return optimize_size ? default_case_values_threshold () : 17;
7192 }
7193
7194 /* Return true if register REGNO is a valid index register.
7195 STRICT_P is true if REG_OK_STRICT is in effect. */
7196
7197 bool
7198 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7199 {
7200 if (!HARD_REGISTER_NUM_P (regno))
7201 {
7202 if (!strict_p)
7203 return true;
7204
7205 if (!reg_renumber)
7206 return false;
7207
7208 regno = reg_renumber[regno];
7209 }
7210 return GP_REGNUM_P (regno);
7211 }
7212
7213 /* Return true if register REGNO is a valid base register.
7214 STRICT_P is true if REG_OK_STRICT is in effect. */
7215
7216 bool
7217 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7218 {
7219 if (!HARD_REGISTER_NUM_P (regno))
7220 {
7221 if (!strict_p)
7222 return true;
7223
7224 if (!reg_renumber)
7225 return false;
7226
7227 regno = reg_renumber[regno];
7228 }
7229
7230 /* The fake registers will be eliminated to either the stack or
7231 hard frame pointer, both of which are usually valid base registers.
7232 Reload deals with the cases where the eliminated form isn't valid. */
7233 return (GP_REGNUM_P (regno)
7234 || regno == SP_REGNUM
7235 || regno == FRAME_POINTER_REGNUM
7236 || regno == ARG_POINTER_REGNUM);
7237 }
7238
7239 /* Return true if X is a valid base register.
7240 STRICT_P is true if REG_OK_STRICT is in effect. */
7241
7242 static bool
7243 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7244 {
7245 if (!strict_p
7246 && GET_CODE (x) == SUBREG
7247 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7248 x = SUBREG_REG (x);
7249
7250 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7251 }
7252
7253 /* Return true if address offset is a valid index. If it is, fill in INFO
7254 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7255
7256 static bool
7257 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7258 machine_mode mode, bool strict_p)
7259 {
7260 enum aarch64_address_type type;
7261 rtx index;
7262 int shift;
7263
7264 /* (reg:P) */
7265 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7266 && GET_MODE (x) == Pmode)
7267 {
7268 type = ADDRESS_REG_REG;
7269 index = x;
7270 shift = 0;
7271 }
7272 /* (sign_extend:DI (reg:SI)) */
7273 else if ((GET_CODE (x) == SIGN_EXTEND
7274 || GET_CODE (x) == ZERO_EXTEND)
7275 && GET_MODE (x) == DImode
7276 && GET_MODE (XEXP (x, 0)) == SImode)
7277 {
7278 type = (GET_CODE (x) == SIGN_EXTEND)
7279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7280 index = XEXP (x, 0);
7281 shift = 0;
7282 }
7283 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7284 else if (GET_CODE (x) == MULT
7285 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7286 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7287 && GET_MODE (XEXP (x, 0)) == DImode
7288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7289 && CONST_INT_P (XEXP (x, 1)))
7290 {
7291 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7293 index = XEXP (XEXP (x, 0), 0);
7294 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7295 }
7296 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7297 else if (GET_CODE (x) == ASHIFT
7298 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7299 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7300 && GET_MODE (XEXP (x, 0)) == DImode
7301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7302 && CONST_INT_P (XEXP (x, 1)))
7303 {
7304 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7305 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7306 index = XEXP (XEXP (x, 0), 0);
7307 shift = INTVAL (XEXP (x, 1));
7308 }
7309 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7310 else if ((GET_CODE (x) == SIGN_EXTRACT
7311 || GET_CODE (x) == ZERO_EXTRACT)
7312 && GET_MODE (x) == DImode
7313 && GET_CODE (XEXP (x, 0)) == MULT
7314 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7315 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7316 {
7317 type = (GET_CODE (x) == SIGN_EXTRACT)
7318 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7319 index = XEXP (XEXP (x, 0), 0);
7320 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7321 if (INTVAL (XEXP (x, 1)) != 32 + shift
7322 || INTVAL (XEXP (x, 2)) != 0)
7323 shift = -1;
7324 }
7325 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7326 (const_int 0xffffffff<<shift)) */
7327 else if (GET_CODE (x) == AND
7328 && GET_MODE (x) == DImode
7329 && GET_CODE (XEXP (x, 0)) == MULT
7330 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7331 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7332 && CONST_INT_P (XEXP (x, 1)))
7333 {
7334 type = ADDRESS_REG_UXTW;
7335 index = XEXP (XEXP (x, 0), 0);
7336 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7337 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7338 shift = -1;
7339 }
7340 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7341 else if ((GET_CODE (x) == SIGN_EXTRACT
7342 || GET_CODE (x) == ZERO_EXTRACT)
7343 && GET_MODE (x) == DImode
7344 && GET_CODE (XEXP (x, 0)) == ASHIFT
7345 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7346 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7347 {
7348 type = (GET_CODE (x) == SIGN_EXTRACT)
7349 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7350 index = XEXP (XEXP (x, 0), 0);
7351 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7352 if (INTVAL (XEXP (x, 1)) != 32 + shift
7353 || INTVAL (XEXP (x, 2)) != 0)
7354 shift = -1;
7355 }
7356 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7357 (const_int 0xffffffff<<shift)) */
7358 else if (GET_CODE (x) == AND
7359 && GET_MODE (x) == DImode
7360 && GET_CODE (XEXP (x, 0)) == ASHIFT
7361 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7362 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7363 && CONST_INT_P (XEXP (x, 1)))
7364 {
7365 type = ADDRESS_REG_UXTW;
7366 index = XEXP (XEXP (x, 0), 0);
7367 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7368 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7369 shift = -1;
7370 }
7371 /* (mult:P (reg:P) (const_int scale)) */
7372 else if (GET_CODE (x) == MULT
7373 && GET_MODE (x) == Pmode
7374 && GET_MODE (XEXP (x, 0)) == Pmode
7375 && CONST_INT_P (XEXP (x, 1)))
7376 {
7377 type = ADDRESS_REG_REG;
7378 index = XEXP (x, 0);
7379 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7380 }
7381 /* (ashift:P (reg:P) (const_int shift)) */
7382 else if (GET_CODE (x) == ASHIFT
7383 && GET_MODE (x) == Pmode
7384 && GET_MODE (XEXP (x, 0)) == Pmode
7385 && CONST_INT_P (XEXP (x, 1)))
7386 {
7387 type = ADDRESS_REG_REG;
7388 index = XEXP (x, 0);
7389 shift = INTVAL (XEXP (x, 1));
7390 }
7391 else
7392 return false;
7393
7394 if (!strict_p
7395 && GET_CODE (index) == SUBREG
7396 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7397 index = SUBREG_REG (index);
7398
7399 if (aarch64_sve_data_mode_p (mode))
7400 {
7401 if (type != ADDRESS_REG_REG
7402 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7403 return false;
7404 }
7405 else
7406 {
7407 if (shift != 0
7408 && !(IN_RANGE (shift, 1, 3)
7409 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7410 return false;
7411 }
7412
7413 if (REG_P (index)
7414 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7415 {
7416 info->type = type;
7417 info->offset = index;
7418 info->shift = shift;
7419 return true;
7420 }
7421
7422 return false;
7423 }
7424
7425 /* Return true if MODE is one of the modes for which we
7426 support LDP/STP operations. */
7427
7428 static bool
7429 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7430 {
7431 return mode == SImode || mode == DImode
7432 || mode == SFmode || mode == DFmode
7433 || (aarch64_vector_mode_supported_p (mode)
7434 && (known_eq (GET_MODE_SIZE (mode), 8)
7435 || (known_eq (GET_MODE_SIZE (mode), 16)
7436 && (aarch64_tune_params.extra_tuning_flags
7437 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7438 }
7439
7440 /* Return true if REGNO is a virtual pointer register, or an eliminable
7441 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7442 include stack_pointer or hard_frame_pointer. */
7443 static bool
7444 virt_or_elim_regno_p (unsigned regno)
7445 {
7446 return ((regno >= FIRST_VIRTUAL_REGISTER
7447 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7448 || regno == FRAME_POINTER_REGNUM
7449 || regno == ARG_POINTER_REGNUM);
7450 }
7451
7452 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7453 If it is, fill in INFO appropriately. STRICT_P is true if
7454 REG_OK_STRICT is in effect. */
7455
7456 bool
7457 aarch64_classify_address (struct aarch64_address_info *info,
7458 rtx x, machine_mode mode, bool strict_p,
7459 aarch64_addr_query_type type)
7460 {
7461 enum rtx_code code = GET_CODE (x);
7462 rtx op0, op1;
7463 poly_int64 offset;
7464
7465 HOST_WIDE_INT const_size;
7466
7467 /* On BE, we use load/store pair for all large int mode load/stores.
7468 TI/TFmode may also use a load/store pair. */
7469 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7470 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7471 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7472 || type == ADDR_QUERY_LDP_STP_N
7473 || mode == TImode
7474 || mode == TFmode
7475 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7476
7477 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
7478 size of the memory being loaded/stored, while the mode used to form the
7479 address is half of that. */
7480 if (type == ADDR_QUERY_LDP_STP_N
7481 && known_eq (GET_MODE_SIZE (mode), 16))
7482 mode = DFmode;
7483
7484 bool allow_reg_index_p = (!load_store_pair_p
7485 && (known_lt (GET_MODE_SIZE (mode), 16)
7486 || vec_flags == VEC_ADVSIMD
7487 || vec_flags & VEC_SVE_DATA));
7488
7489 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7490 [Rn, #offset, MUL VL]. */
7491 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7492 && (code != REG && code != PLUS))
7493 return false;
7494
7495 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7496 REG addressing. */
7497 if (advsimd_struct_p
7498 && !BYTES_BIG_ENDIAN
7499 && (code != POST_INC && code != REG))
7500 return false;
7501
7502 gcc_checking_assert (GET_MODE (x) == VOIDmode
7503 || SCALAR_INT_MODE_P (GET_MODE (x)));
7504
7505 switch (code)
7506 {
7507 case REG:
7508 case SUBREG:
7509 info->type = ADDRESS_REG_IMM;
7510 info->base = x;
7511 info->offset = const0_rtx;
7512 info->const_offset = 0;
7513 return aarch64_base_register_rtx_p (x, strict_p);
7514
7515 case PLUS:
7516 op0 = XEXP (x, 0);
7517 op1 = XEXP (x, 1);
7518
7519 if (! strict_p
7520 && REG_P (op0)
7521 && virt_or_elim_regno_p (REGNO (op0))
7522 && poly_int_rtx_p (op1, &offset))
7523 {
7524 info->type = ADDRESS_REG_IMM;
7525 info->base = op0;
7526 info->offset = op1;
7527 info->const_offset = offset;
7528
7529 return true;
7530 }
7531
7532 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7533 && aarch64_base_register_rtx_p (op0, strict_p)
7534 && poly_int_rtx_p (op1, &offset))
7535 {
7536 info->type = ADDRESS_REG_IMM;
7537 info->base = op0;
7538 info->offset = op1;
7539 info->const_offset = offset;
7540
7541 /* TImode and TFmode values are allowed in both pairs of X
7542 registers and individual Q registers. The available
7543 address modes are:
7544 X,X: 7-bit signed scaled offset
7545 Q: 9-bit signed offset
7546 We conservatively require an offset representable in either mode.
7547 When performing the check for pairs of X registers i.e. LDP/STP
7548 pass down DImode since that is the natural size of the LDP/STP
7549 instruction memory accesses. */
7550 if (mode == TImode || mode == TFmode)
7551 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7552 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7553 || offset_12bit_unsigned_scaled_p (mode, offset)));
7554
7555 /* A 7-bit offset check because OImode will emit an ldp/stp
7556 instruction (only big endian will get here).
7557 For ldp/stp instructions, the offset is scaled for the size of a
7558 single element of the pair. */
7559 if (mode == OImode)
7560 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7561
7562 /* A 7-bit pair offset check plus a 9/12-bit offset check because CImode
7563 will emit an ldp/stp plus an ldr/str (only big endian will get here). */
7564 if (mode == CImode)
7565 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7566 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7567 offset + 32)
7568 || offset_12bit_unsigned_scaled_p (V16QImode,
7569 offset + 32)));
7570
7571 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7572 instructions (only big endian will get here). */
7573 if (mode == XImode)
7574 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7575 && aarch64_offset_7bit_signed_scaled_p (TImode,
7576 offset + 32));
7577
7578 /* Make "m" use the LD1 offset range for SVE data modes, so
7579 that pre-RTL optimizers like ivopts will work to that range
7580 instead of the wider LDR/STR range. */
7581 if (vec_flags == VEC_SVE_DATA)
7582 return (type == ADDR_QUERY_M
7583 ? offset_4bit_signed_scaled_p (mode, offset)
7584 : offset_9bit_signed_scaled_p (mode, offset));
7585
7586 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7587 {
7588 poly_int64 end_offset = (offset
7589 + GET_MODE_SIZE (mode)
7590 - BYTES_PER_SVE_VECTOR);
7591 return (type == ADDR_QUERY_M
7592 ? offset_4bit_signed_scaled_p (mode, offset)
7593 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7594 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7595 end_offset)));
7596 }
7597
7598 if (vec_flags == VEC_SVE_PRED)
7599 return offset_9bit_signed_scaled_p (mode, offset);
7600
7601 if (load_store_pair_p)
7602 return ((known_eq (GET_MODE_SIZE (mode), 4)
7603 || known_eq (GET_MODE_SIZE (mode), 8)
7604 || known_eq (GET_MODE_SIZE (mode), 16))
7605 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7606 else
7607 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7608 || offset_12bit_unsigned_scaled_p (mode, offset));
7609 }
7610
7611 if (allow_reg_index_p)
7612 {
7613 /* Look for base + (scaled/extended) index register. */
7614 if (aarch64_base_register_rtx_p (op0, strict_p)
7615 && aarch64_classify_index (info, op1, mode, strict_p))
7616 {
7617 info->base = op0;
7618 return true;
7619 }
7620 if (aarch64_base_register_rtx_p (op1, strict_p)
7621 && aarch64_classify_index (info, op0, mode, strict_p))
7622 {
7623 info->base = op1;
7624 return true;
7625 }
7626 }
7627
7628 return false;
7629
7630 case POST_INC:
7631 case POST_DEC:
7632 case PRE_INC:
7633 case PRE_DEC:
7634 info->type = ADDRESS_REG_WB;
7635 info->base = XEXP (x, 0);
7636 info->offset = NULL_RTX;
7637 return aarch64_base_register_rtx_p (info->base, strict_p);
7638
7639 case POST_MODIFY:
7640 case PRE_MODIFY:
7641 info->type = ADDRESS_REG_WB;
7642 info->base = XEXP (x, 0);
7643 if (GET_CODE (XEXP (x, 1)) == PLUS
7644 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7645 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7646 && aarch64_base_register_rtx_p (info->base, strict_p))
7647 {
7648 info->offset = XEXP (XEXP (x, 1), 1);
7649 info->const_offset = offset;
7650
7651 /* TImode and TFmode values are allowed in both pairs of X
7652 registers and individual Q registers. The available
7653 address modes are:
7654 X,X: 7-bit signed scaled offset
7655 Q: 9-bit signed offset
7656 We conservatively require an offset representable in either mode.
7657 */
7658 if (mode == TImode || mode == TFmode)
7659 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7660 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7661
7662 if (load_store_pair_p)
7663 return ((known_eq (GET_MODE_SIZE (mode), 4)
7664 || known_eq (GET_MODE_SIZE (mode), 8)
7665 || known_eq (GET_MODE_SIZE (mode), 16))
7666 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7667 else
7668 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7669 }
7670 return false;
7671
7672 case CONST:
7673 case SYMBOL_REF:
7674 case LABEL_REF:
7675 /* load literal: pc-relative constant pool entry. Only supported
7676 for SI mode or larger. */
7677 info->type = ADDRESS_SYMBOLIC;
7678
7679 if (!load_store_pair_p
7680 && GET_MODE_SIZE (mode).is_constant (&const_size)
7681 && const_size >= 4)
7682 {
7683 rtx sym, addend;
7684
7685 split_const (x, &sym, &addend);
7686 return ((GET_CODE (sym) == LABEL_REF
7687 || (GET_CODE (sym) == SYMBOL_REF
7688 && CONSTANT_POOL_ADDRESS_P (sym)
7689 && aarch64_pcrelative_literal_loads)));
7690 }
7691 return false;
7692
7693 case LO_SUM:
7694 info->type = ADDRESS_LO_SUM;
7695 info->base = XEXP (x, 0);
7696 info->offset = XEXP (x, 1);
7697 if (allow_reg_index_p
7698 && aarch64_base_register_rtx_p (info->base, strict_p))
7699 {
7700 rtx sym, offs;
7701 split_const (info->offset, &sym, &offs);
7702 if (GET_CODE (sym) == SYMBOL_REF
7703 && (aarch64_classify_symbol (sym, INTVAL (offs))
7704 == SYMBOL_SMALL_ABSOLUTE))
7705 {
7706 /* The symbol and offset must be aligned to the access size. */
7707 unsigned int align;
7708
7709 if (CONSTANT_POOL_ADDRESS_P (sym))
7710 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7711 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7712 {
7713 tree exp = SYMBOL_REF_DECL (sym);
7714 align = TYPE_ALIGN (TREE_TYPE (exp));
7715 align = aarch64_constant_alignment (exp, align);
7716 }
7717 else if (SYMBOL_REF_DECL (sym))
7718 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7719 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7720 && SYMBOL_REF_BLOCK (sym) != NULL)
7721 align = SYMBOL_REF_BLOCK (sym)->alignment;
7722 else
7723 align = BITS_PER_UNIT;
7724
7725 poly_int64 ref_size = GET_MODE_SIZE (mode);
7726 if (known_eq (ref_size, 0))
7727 ref_size = GET_MODE_SIZE (DImode);
7728
7729 return (multiple_p (INTVAL (offs), ref_size)
7730 && multiple_p (align / BITS_PER_UNIT, ref_size));
7731 }
7732 }
7733 return false;
7734
7735 default:
7736 return false;
7737 }
7738 }
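/* Examples of address forms recognized above, for a DImode access:
   [x0] and [x0, #16] classify as ADDRESS_REG_IMM;
   [x0, x1, lsl #3] and [x0, w1, sxtw #3] as ADDRESS_REG_REG/ADDRESS_REG_SXTW;
   [x0], #16 and [x0, #16]! as ADDRESS_REG_WB;
   and (lo_sum x0 sym) as ADDRESS_LO_SUM.  */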
7739
7740 /* Return true if the address X is valid for a PRFM instruction.
7741 STRICT_P is true if we should do strict checking with
7742 aarch64_classify_address. */
7743
7744 bool
7745 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7746 {
7747 struct aarch64_address_info addr;
7748
7749 /* PRFM accepts the same addresses as DImode... */
7750 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7751 if (!res)
7752 return false;
7753
7754 /* ... except writeback forms. */
7755 return addr.type != ADDRESS_REG_WB;
7756 }
7757
7758 bool
7759 aarch64_symbolic_address_p (rtx x)
7760 {
7761 rtx offset;
7762
7763 split_const (x, &x, &offset);
7764 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7765 }
7766
7767 /* Classify the base of symbolic expression X. */
7768
7769 enum aarch64_symbol_type
7770 aarch64_classify_symbolic_expression (rtx x)
7771 {
7772 rtx offset;
7773
7774 split_const (x, &x, &offset);
7775 return aarch64_classify_symbol (x, INTVAL (offset));
7776 }
7777
7778
7779 /* Return TRUE if X is a legitimate address for accessing memory in
7780 mode MODE. */
7781 static bool
7782 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7783 {
7784 struct aarch64_address_info addr;
7785
7786 return aarch64_classify_address (&addr, x, mode, strict_p);
7787 }
7788
7789 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7790 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7791 bool
7792 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7793 aarch64_addr_query_type type)
7794 {
7795 struct aarch64_address_info addr;
7796
7797 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7798 }
7799
7800 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7801
7802 static bool
7803 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7804 poly_int64 orig_offset,
7805 machine_mode mode)
7806 {
7807 HOST_WIDE_INT size;
7808 if (GET_MODE_SIZE (mode).is_constant (&size))
7809 {
7810 HOST_WIDE_INT const_offset, second_offset;
7811
7812 /* A general SVE offset is A * VQ + B. Remove the A component from
7813 coefficient 0 in order to get the constant B. */
7814 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7815
7816 /* Split an out-of-range address displacement into a base and
7817 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7818 range otherwise to increase opportunities for sharing the base
7819 address of different sizes. Unaligned accesses use the signed
7820 9-bit range, TImode/TFmode use the intersection of signed
7821 scaled 7-bit and signed 9-bit offset. */
7822 if (mode == TImode || mode == TFmode)
7823 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7824 else if ((const_offset & (size - 1)) != 0)
7825 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7826 else
7827 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7828
7829 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7830 return false;
7831
7832 /* Split the offset into second_offset and the rest. */
7833 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7834 *offset2 = gen_int_mode (second_offset, Pmode);
7835 return true;
7836 }
7837 else
7838 {
7839 /* Get the mode we should use as the basis of the range. For structure
7840 modes this is the mode of one vector. */
7841 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7842 machine_mode step_mode
7843 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7844
7845 /* Get the "mul vl" multiplier we'd like to use. */
7846 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7847 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7848 if (vec_flags & VEC_SVE_DATA)
7849 /* LDR supports a 9-bit range, but the move patterns for
7850 structure modes require all vectors to be in range of the
7851 same base. The simplest way of accommodating that while still
7852 promoting reuse of anchor points between different modes is
7853 to use an 8-bit range unconditionally. */
7854 vnum = ((vnum + 128) & 255) - 128;
7855 else
7856 /* Predicates are only handled singly, so we might as well use
7857 the full range. */
7858 vnum = ((vnum + 256) & 511) - 256;
7859 if (vnum == 0)
7860 return false;
7861
7862 /* Convert the "mul vl" multiplier into a byte offset. */
7863 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7864 if (known_eq (second_offset, orig_offset))
7865 return false;
7866
7867 /* Split the offset into second_offset and the rest. */
7868 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7869 *offset2 = gen_int_mode (second_offset, Pmode);
7870 return true;
7871 }
7872 }
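/* A rough worked example for the constant-size path: a DImode access at
   offset 0x4010 splits into an anchor adjustment of 0x4000 (*offset1) and a
   residual offset of 0x10 (*offset2), which fits the scaled 12-bit
   addressing range.  */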
7873
7874 /* Return the binary representation of floating point constant VALUE in INTVAL.
7875 If the value cannot be converted, return false without setting INTVAL.
7876 The conversion is done in the mode of VALUE. */
7877 bool
7878 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7879 {
7880
7881 /* We make a general exception for 0. */
7882 if (aarch64_float_const_zero_rtx_p (value))
7883 {
7884 *intval = 0;
7885 return true;
7886 }
7887
7888 scalar_float_mode mode;
7889 if (GET_CODE (value) != CONST_DOUBLE
7890 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7891 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7892 /* Only support up to DF mode. */
7893 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7894 return false;
7895
7896 unsigned HOST_WIDE_INT ival = 0;
7897
7898 long res[2];
7899 real_to_target (res,
7900 CONST_DOUBLE_REAL_VALUE (value),
7901 REAL_MODE_FORMAT (mode));
7902
7903 if (mode == DFmode)
7904 {
7905 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7906 ival = zext_hwi (res[order], 32);
7907 ival |= (zext_hwi (res[1 - order], 32) << 32);
7908 }
7909 else
7910 ival = zext_hwi (res[0], 32);
7911
7912 *intval = ival;
7913 return true;
7914 }
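/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000.  */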
7915
7916 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7917 single MOV(+MOVK) followed by an FMOV. */
7918 bool
7919 aarch64_float_const_rtx_p (rtx x)
7920 {
7921 machine_mode mode = GET_MODE (x);
7922 if (mode == VOIDmode)
7923 return false;
7924
7925 /* Determine whether it's cheaper to materialize float constants as
7926 mov/movk pairs than to load them with an adrp/ldr pair. */
7927 unsigned HOST_WIDE_INT ival;
7928
7929 if (GET_CODE (x) == CONST_DOUBLE
7930 && SCALAR_FLOAT_MODE_P (mode)
7931 && aarch64_reinterpret_float_as_int (x, &ival))
7932 {
7933 scalar_int_mode imode = (mode == HFmode
7934 ? SImode
7935 : int_mode_for_mode (mode).require ());
7936 int num_instr = aarch64_internal_mov_immediate
7937 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7938 return num_instr < 3;
7939 }
7940
7941 return false;
7942 }
7943
7944 /* Return TRUE if rtx X is the immediate constant 0.0. */
7945 bool
7946 aarch64_float_const_zero_rtx_p (rtx x)
7947 {
7948 if (GET_MODE (x) == VOIDmode)
7949 return false;
7950
7951 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7952 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7953 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7954 }
7955
7956 /* Return TRUE if rtx X is an immediate constant that fits in a single
7957 MOVI immediate operation. */
7958 bool
7959 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7960 {
7961 if (!TARGET_SIMD)
7962 return false;
7963
7964 machine_mode vmode;
7965 scalar_int_mode imode;
7966 unsigned HOST_WIDE_INT ival;
7967
7968 if (GET_CODE (x) == CONST_DOUBLE
7969 && SCALAR_FLOAT_MODE_P (mode))
7970 {
7971 if (!aarch64_reinterpret_float_as_int (x, &ival))
7972 return false;
7973
7974 /* We make a general exception for 0. */
7975 if (aarch64_float_const_zero_rtx_p (x))
7976 return true;
7977
7978 imode = int_mode_for_mode (mode).require ();
7979 }
7980 else if (GET_CODE (x) == CONST_INT
7981 && is_a <scalar_int_mode> (mode, &imode))
7982 ival = INTVAL (x);
7983 else
7984 return false;
7985
7986 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7987 a 128-bit vector mode. */
7988 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7989
7990 vmode = aarch64_simd_container_mode (imode, width);
7991 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7992
7993 return aarch64_simd_valid_immediate (v_op, NULL);
7994 }
7995
7996
7997 /* Return the fixed registers used for condition codes. */
7998
7999 static bool
8000 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8001 {
8002 *p1 = CC_REGNUM;
8003 *p2 = INVALID_REGNUM;
8004 return true;
8005 }
8006
8007 /* This function is used by the call expanders of the machine description.
8008 RESULT is the register in which the result is returned. It's NULL for
8009 "call" and "sibcall".
8010 MEM is the location of the function call.
8011 SIBCALL indicates whether this function call is a normal call or a sibling call.
8012 It will generate a different pattern accordingly. */
8013
8014 void
8015 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
8016 {
8017 rtx call, callee, tmp;
8018 rtvec vec;
8019 machine_mode mode;
8020
8021 gcc_assert (MEM_P (mem));
8022 callee = XEXP (mem, 0);
8023 mode = GET_MODE (callee);
8024 gcc_assert (mode == Pmode);
8025
8026 /* Decide if we should generate indirect calls by loading the
8027 address of the callee into a register before performing
8028 the branch-and-link. */
8029 if (SYMBOL_REF_P (callee)
8030 ? (aarch64_is_long_call_p (callee)
8031 || aarch64_is_noplt_call_p (callee))
8032 : !REG_P (callee))
8033 XEXP (mem, 0) = force_reg (mode, callee);
8034
8035 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8036
8037 if (result != NULL_RTX)
8038 call = gen_rtx_SET (result, call);
8039
8040 if (sibcall)
8041 tmp = ret_rtx;
8042 else
8043 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8044
8045 vec = gen_rtvec (2, call, tmp);
8046 call = gen_rtx_PARALLEL (VOIDmode, vec);
8047
8048 aarch64_emit_call_insn (call);
8049 }
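/* The emitted pattern is therefore of the form
   (parallel [(call (mem ...) (const_int 0)) (clobber (reg LR))])
   for a normal call, with the clobber replaced by (return) for a sibcall,
   and with the call wrapped in a (set result ...) when a value is
   returned.  */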
8050
8051 /* Emit call insn with PAT and do aarch64-specific handling. */
8052
8053 void
8054 aarch64_emit_call_insn (rtx pat)
8055 {
8056 rtx insn = emit_call_insn (pat);
8057
8058 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8059 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8060 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8061 }
8062
8063 machine_mode
8064 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8065 {
8066 machine_mode mode_x = GET_MODE (x);
8067 rtx_code code_x = GET_CODE (x);
8068
8069 /* All floating point compares return CCFP if it is an equality
8070 comparison, and CCFPE otherwise. */
8071 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8072 {
8073 switch (code)
8074 {
8075 case EQ:
8076 case NE:
8077 case UNORDERED:
8078 case ORDERED:
8079 case UNLT:
8080 case UNLE:
8081 case UNGT:
8082 case UNGE:
8083 case UNEQ:
8084 return CCFPmode;
8085
8086 case LT:
8087 case LE:
8088 case GT:
8089 case GE:
8090 case LTGT:
8091 return CCFPEmode;
8092
8093 default:
8094 gcc_unreachable ();
8095 }
8096 }
8097
8098 /* Equality comparisons of short modes against zero can be performed
8099 using the TST instruction with the appropriate bitmask. */
8100 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8101 && (code == EQ || code == NE)
8102 && (mode_x == HImode || mode_x == QImode))
8103 return CC_NZmode;
8104
8105 /* Similarly, comparisons of zero_extends from shorter modes can
8106 be performed using an ANDS with an immediate mask. */
8107 if (y == const0_rtx && code_x == ZERO_EXTEND
8108 && (mode_x == SImode || mode_x == DImode)
8109 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8110 && (code == EQ || code == NE))
8111 return CC_NZmode;
8112
8113 if ((mode_x == SImode || mode_x == DImode)
8114 && y == const0_rtx
8115 && (code == EQ || code == NE || code == LT || code == GE)
8116 && (code_x == PLUS || code_x == MINUS || code_x == AND
8117 || code_x == NEG
8118 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8119 && CONST_INT_P (XEXP (x, 2)))))
8120 return CC_NZmode;
8121
8122 /* A compare with a shifted operand. Because of canonicalization,
8123 the comparison will have to be swapped when we emit the assembly
8124 code. */
8125 if ((mode_x == SImode || mode_x == DImode)
8126 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8127 && (code_x == ASHIFT || code_x == ASHIFTRT
8128 || code_x == LSHIFTRT
8129 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8130 return CC_SWPmode;
8131
8132 /* Similarly for a negated operand, but we can only do this for
8133 equalities. */
8134 if ((mode_x == SImode || mode_x == DImode)
8135 && (REG_P (y) || GET_CODE (y) == SUBREG)
8136 && (code == EQ || code == NE)
8137 && code_x == NEG)
8138 return CC_Zmode;
8139
8140 /* A test for unsigned overflow from an addition. */
8141 if ((mode_x == DImode || mode_x == TImode)
8142 && (code == LTU || code == GEU)
8143 && code_x == PLUS
8144 && rtx_equal_p (XEXP (x, 0), y))
8145 return CC_Cmode;
8146
8147 /* A test for unsigned overflow from an add with carry. */
8148 if ((mode_x == DImode || mode_x == TImode)
8149 && (code == LTU || code == GEU)
8150 && code_x == PLUS
8151 && CONST_SCALAR_INT_P (y)
8152 && (rtx_mode_t (y, mode_x)
8153 == (wi::shwi (1, mode_x)
8154 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8155 return CC_ADCmode;
8156
8157 /* A test for signed overflow. */
8158 if ((mode_x == DImode || mode_x == TImode)
8159 && code == NE
8160 && code_x == PLUS
8161 && GET_CODE (y) == SIGN_EXTEND)
8162 return CC_Vmode;
8163
8164 /* For everything else, return CCmode. */
8165 return CCmode;
8166 }
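/* For example, comparing (ashift:DI x (const_int 3)) with a register selects
   CC_SWPmode, while an equality comparison of a QImode register with zero
   selects CC_NZmode.  */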
8167
8168 static int
8169 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8170
8171 int
8172 aarch64_get_condition_code (rtx x)
8173 {
8174 machine_mode mode = GET_MODE (XEXP (x, 0));
8175 enum rtx_code comp_code = GET_CODE (x);
8176
8177 if (GET_MODE_CLASS (mode) != MODE_CC)
8178 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8179 return aarch64_get_condition_code_1 (mode, comp_code);
8180 }
8181
8182 static int
8183 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8184 {
8185 switch (mode)
8186 {
8187 case E_CCFPmode:
8188 case E_CCFPEmode:
8189 switch (comp_code)
8190 {
8191 case GE: return AARCH64_GE;
8192 case GT: return AARCH64_GT;
8193 case LE: return AARCH64_LS;
8194 case LT: return AARCH64_MI;
8195 case NE: return AARCH64_NE;
8196 case EQ: return AARCH64_EQ;
8197 case ORDERED: return AARCH64_VC;
8198 case UNORDERED: return AARCH64_VS;
8199 case UNLT: return AARCH64_LT;
8200 case UNLE: return AARCH64_LE;
8201 case UNGT: return AARCH64_HI;
8202 case UNGE: return AARCH64_PL;
8203 default: return -1;
8204 }
8205 break;
8206
8207 case E_CCmode:
8208 switch (comp_code)
8209 {
8210 case NE: return AARCH64_NE;
8211 case EQ: return AARCH64_EQ;
8212 case GE: return AARCH64_GE;
8213 case GT: return AARCH64_GT;
8214 case LE: return AARCH64_LE;
8215 case LT: return AARCH64_LT;
8216 case GEU: return AARCH64_CS;
8217 case GTU: return AARCH64_HI;
8218 case LEU: return AARCH64_LS;
8219 case LTU: return AARCH64_CC;
8220 default: return -1;
8221 }
8222 break;
8223
8224 case E_CC_SWPmode:
8225 switch (comp_code)
8226 {
8227 case NE: return AARCH64_NE;
8228 case EQ: return AARCH64_EQ;
8229 case GE: return AARCH64_LE;
8230 case GT: return AARCH64_LT;
8231 case LE: return AARCH64_GE;
8232 case LT: return AARCH64_GT;
8233 case GEU: return AARCH64_LS;
8234 case GTU: return AARCH64_CC;
8235 case LEU: return AARCH64_CS;
8236 case LTU: return AARCH64_HI;
8237 default: return -1;
8238 }
8239 break;
8240
8241 case E_CC_NZCmode:
8242 switch (comp_code)
8243 {
8244 case NE: return AARCH64_NE; /* = any */
8245 case EQ: return AARCH64_EQ; /* = none */
8246 case GE: return AARCH64_PL; /* = nfrst */
8247 case LT: return AARCH64_MI; /* = first */
8248 case GEU: return AARCH64_CS; /* = nlast */
8249 case GTU: return AARCH64_HI; /* = pmore */
8250 case LEU: return AARCH64_LS; /* = plast */
8251 case LTU: return AARCH64_CC; /* = last */
8252 default: return -1;
8253 }
8254 break;
8255
8256 case E_CC_NZmode:
8257 switch (comp_code)
8258 {
8259 case NE: return AARCH64_NE;
8260 case EQ: return AARCH64_EQ;
8261 case GE: return AARCH64_PL;
8262 case LT: return AARCH64_MI;
8263 default: return -1;
8264 }
8265 break;
8266
8267 case E_CC_Zmode:
8268 switch (comp_code)
8269 {
8270 case NE: return AARCH64_NE;
8271 case EQ: return AARCH64_EQ;
8272 default: return -1;
8273 }
8274 break;
8275
8276 case E_CC_Cmode:
8277 switch (comp_code)
8278 {
8279 case LTU: return AARCH64_CS;
8280 case GEU: return AARCH64_CC;
8281 default: return -1;
8282 }
8283 break;
8284
8285 case E_CC_ADCmode:
8286 switch (comp_code)
8287 {
8288 case GEU: return AARCH64_CS;
8289 case LTU: return AARCH64_CC;
8290 default: return -1;
8291 }
8292 break;
8293
8294 case E_CC_Vmode:
8295 switch (comp_code)
8296 {
8297 case NE: return AARCH64_VS;
8298 case EQ: return AARCH64_VC;
8299 default: return -1;
8300 }
8301 break;
8302
8303 default:
8304 return -1;
8305 }
8306
8307 return -1;
8308 }
8309
8310 bool
8311 aarch64_const_vec_all_same_in_range_p (rtx x,
8312 HOST_WIDE_INT minval,
8313 HOST_WIDE_INT maxval)
8314 {
8315 rtx elt;
8316 return (const_vec_duplicate_p (x, &elt)
8317 && CONST_INT_P (elt)
8318 && IN_RANGE (INTVAL (elt), minval, maxval));
8319 }
8320
8321 bool
8322 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8323 {
8324 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8325 }
8326
8327 /* Return true if VEC is a constant in which every element is in the range
8328 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8329
8330 static bool
8331 aarch64_const_vec_all_in_range_p (rtx vec,
8332 HOST_WIDE_INT minval,
8333 HOST_WIDE_INT maxval)
8334 {
8335 if (GET_CODE (vec) != CONST_VECTOR
8336 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8337 return false;
8338
8339 int nunits;
8340 if (!CONST_VECTOR_STEPPED_P (vec))
8341 nunits = const_vector_encoded_nelts (vec);
8342 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8343 return false;
8344
8345 for (int i = 0; i < nunits; i++)
8346 {
8347 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8348 if (!CONST_INT_P (vec_elem)
8349 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8350 return false;
8351 }
8352 return true;
8353 }
8354
8355 /* N Z C V. */
8356 #define AARCH64_CC_V 1
8357 #define AARCH64_CC_C (1 << 1)
8358 #define AARCH64_CC_Z (1 << 2)
8359 #define AARCH64_CC_N (1 << 3)
8360
8361 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8362 static const int aarch64_nzcv_codes[] =
8363 {
8364 0, /* EQ, Z == 1. */
8365 AARCH64_CC_Z, /* NE, Z == 0. */
8366 0, /* CS, C == 1. */
8367 AARCH64_CC_C, /* CC, C == 0. */
8368 0, /* MI, N == 1. */
8369 AARCH64_CC_N, /* PL, N == 0. */
8370 0, /* VS, V == 1. */
8371 AARCH64_CC_V, /* VC, V == 0. */
8372 0, /* HI, C == 1 && Z == 0. */
8373 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8374 AARCH64_CC_V, /* GE, N == V. */
8375 0, /* LT, N != V. */
8376 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8377 0, /* LE, !(Z == 0 && N == V). */
8378 0, /* AL, Any. */
8379 0 /* NV, Any. */
8380 };
8381
8382 /* Print floating-point vector immediate operand X to F, negating it
8383 first if NEGATE is true. Return true on success, false if it isn't
8384 a constant we can handle. */
8385
8386 static bool
8387 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8388 {
8389 rtx elt;
8390
8391 if (!const_vec_duplicate_p (x, &elt))
8392 return false;
8393
8394 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8395 if (negate)
8396 r = real_value_negate (&r);
8397
8398 /* Handle the SVE single-bit immediates specially, since they have a
8399 fixed form in the assembly syntax. */
8400 if (real_equal (&r, &dconst0))
8401 asm_fprintf (f, "0.0");
8402 else if (real_equal (&r, &dconst2))
8403 asm_fprintf (f, "2.0");
8404 else if (real_equal (&r, &dconst1))
8405 asm_fprintf (f, "1.0");
8406 else if (real_equal (&r, &dconsthalf))
8407 asm_fprintf (f, "0.5");
8408 else
8409 {
8410 const int buf_size = 20;
8411 char float_buf[buf_size] = {'\0'};
8412 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8413 1, GET_MODE (elt));
8414 asm_fprintf (f, "%s", float_buf);
8415 }
8416
8417 return true;
8418 }
8419
8420 /* Return the equivalent letter for size. */
8421 static char
8422 sizetochar (int size)
8423 {
8424 switch (size)
8425 {
8426 case 64: return 'd';
8427 case 32: return 's';
8428 case 16: return 'h';
8429 case 8 : return 'b';
8430 default: gcc_unreachable ();
8431 }
8432 }
8433
8434 /* Print operand X to file F in a target specific manner according to CODE.
8435 The acceptable formatting commands given by CODE are:
8436 'c': An integer or symbol address without a preceding #
8437 sign.
8438 'C': Take the duplicated element in a vector constant
8439 and print it in hex.
8440 'D': Take the duplicated element in a vector constant
8441 and print it as an unsigned integer, in decimal.
8442 'e': Print the sign/zero-extend size as a character 8->b,
8443 16->h, 32->w. Can also be used for masks:
8444 0xff->b, 0xffff->h, 0xffffffff->w.
8445 'I': If the operand is a duplicated vector constant,
8446 replace it with the duplicated scalar. If the
8447 operand is then a floating-point constant, replace
8448 it with the integer bit representation. Print the
8449 transformed constant as a signed decimal number.
8450 'p': Prints N such that 2^N == X (X must be power of 2 and
8451 const int).
8452 'P': Print the number of non-zero bits in X (a const_int).
8453 'H': Print the higher numbered register of a pair (TImode)
8454 of regs.
8455 'm': Print a condition (eq, ne, etc).
8456 'M': Same as 'm', but invert condition.
8457 'N': Take the duplicated element in a vector constant
8458 and print the negative of it in decimal.
8459 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8460 'S/T/U/V': Print a FP/SIMD register name for a register list.
8461 The register printed is the FP/SIMD register name
8462 of X + 0/1/2/3 for S/T/U/V.
8463 'R': Print a scalar Integer/FP/SIMD register name + 1.
8464 'X': Print bottom 16 bits of integer constant in hex.
8465 'w/x': Print a general register name or the zero register
8466 (32-bit or 64-bit).
8467 '0': Print a normal operand, if it's a general register,
8468 then we assume DImode.
8469 'k': Print NZCV for conditional compare instructions.
8470 'A': Output address constant representing the first
8471 argument of X, specifying a relocation offset
8472 if appropriate.
8473 'L': Output constant address specified by X
8474 with a relocation offset if appropriate.
8475 'G': Prints address of X, specifying a PC relative
8476 relocation mode if appropriate.
8477 'y': Output address of LDP or STP - this is used for
8478 some LDP/STPs which don't use a PARALLEL in their
8479 pattern (so the mode needs to be adjusted).
8480 'z': Output address of a typical LDP or STP. */
8481
8482 static void
8483 aarch64_print_operand (FILE *f, rtx x, int code)
8484 {
8485 rtx elt;
8486 switch (code)
8487 {
8488 case 'c':
8489 switch (GET_CODE (x))
8490 {
8491 case CONST_INT:
8492 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8493 break;
8494
8495 case SYMBOL_REF:
8496 output_addr_const (f, x);
8497 break;
8498
8499 case CONST:
8500 if (GET_CODE (XEXP (x, 0)) == PLUS
8501 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8502 {
8503 output_addr_const (f, x);
8504 break;
8505 }
8506 /* Fall through. */
8507
8508 default:
8509 output_operand_lossage ("unsupported operand for code '%c'", code);
8510 }
8511 break;
8512
8513 case 'e':
8514 {
8515 x = unwrap_const_vec_duplicate (x);
8516 if (!CONST_INT_P (x))
8517 {
8518 output_operand_lossage ("invalid operand for '%%%c'", code);
8519 return;
8520 }
8521
8522 HOST_WIDE_INT val = INTVAL (x);
8523 if ((val & ~7) == 8 || val == 0xff)
8524 fputc ('b', f);
8525 else if ((val & ~7) == 16 || val == 0xffff)
8526 fputc ('h', f);
8527 else if ((val & ~7) == 32 || val == 0xffffffff)
8528 fputc ('w', f);
8529 else
8530 {
8531 output_operand_lossage ("invalid operand for '%%%c'", code);
8532 return;
8533 }
8534 }
8535 break;
8536
8537 case 'p':
8538 {
8539 int n;
8540
8541 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8542 {
8543 output_operand_lossage ("invalid operand for '%%%c'", code);
8544 return;
8545 }
8546
8547 asm_fprintf (f, "%d", n);
8548 }
8549 break;
8550
8551 case 'P':
8552 if (!CONST_INT_P (x))
8553 {
8554 output_operand_lossage ("invalid operand for '%%%c'", code);
8555 return;
8556 }
8557
8558 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8559 break;
8560
8561 case 'H':
8562 if (x == const0_rtx)
8563 {
8564 asm_fprintf (f, "xzr");
8565 break;
8566 }
8567
8568 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8569 {
8570 output_operand_lossage ("invalid operand for '%%%c'", code);
8571 return;
8572 }
8573
8574 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8575 break;
8576
8577 case 'I':
8578 {
8579 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8580 if (CONST_INT_P (x))
8581 asm_fprintf (f, "%wd", INTVAL (x));
8582 else
8583 {
8584 output_operand_lossage ("invalid operand for '%%%c'", code);
8585 return;
8586 }
8587 break;
8588 }
8589
8590 case 'M':
8591 case 'm':
8592 {
8593 int cond_code;
8594 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8595 if (x == const_true_rtx)
8596 {
8597 if (code == 'M')
8598 fputs ("nv", f);
8599 return;
8600 }
8601
8602 if (!COMPARISON_P (x))
8603 {
8604 output_operand_lossage ("invalid operand for '%%%c'", code);
8605 return;
8606 }
8607
8608 cond_code = aarch64_get_condition_code (x);
8609 gcc_assert (cond_code >= 0);
8610 if (code == 'M')
8611 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8612 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8613 fputs (aarch64_sve_condition_codes[cond_code], f);
8614 else
8615 fputs (aarch64_condition_codes[cond_code], f);
8616 }
8617 break;
8618
8619 case 'N':
8620 if (!const_vec_duplicate_p (x, &elt))
8621 {
8622 output_operand_lossage ("invalid vector constant");
8623 return;
8624 }
8625
8626 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8627 asm_fprintf (f, "%wd", -INTVAL (elt));
8628 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8629 && aarch64_print_vector_float_operand (f, x, true))
8630 ;
8631 else
8632 {
8633 output_operand_lossage ("invalid vector constant");
8634 return;
8635 }
8636 break;
8637
8638 case 'b':
8639 case 'h':
8640 case 's':
8641 case 'd':
8642 case 'q':
8643 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8644 {
8645 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8646 return;
8647 }
8648 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8649 break;
8650
8651 case 'S':
8652 case 'T':
8653 case 'U':
8654 case 'V':
8655 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8656 {
8657 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8658 return;
8659 }
8660 asm_fprintf (f, "%c%d",
8661 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8662 REGNO (x) - V0_REGNUM + (code - 'S'));
8663 break;
8664
8665 case 'R':
8666 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
8667 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8668 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8669 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
8670 else
8671 output_operand_lossage ("incompatible register operand for '%%%c'",
8672 code);
8673 break;
8674
8675 case 'X':
8676 if (!CONST_INT_P (x))
8677 {
8678 output_operand_lossage ("invalid operand for '%%%c'", code);
8679 return;
8680 }
8681 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8682 break;
8683
8684 case 'C':
8685 {
8686 /* Print a replicated constant in hex. */
8687 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8688 {
8689 output_operand_lossage ("invalid operand for '%%%c'", code);
8690 return;
8691 }
8692 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8693 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8694 }
8695 break;
8696
8697 case 'D':
8698 {
8699 /* Print a replicated constant in decimal, treating it as
8700 unsigned. */
8701 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8702 {
8703 output_operand_lossage ("invalid operand for '%%%c'", code);
8704 return;
8705 }
8706 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8707 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8708 }
8709 break;
8710
8711 case 'w':
8712 case 'x':
8713 if (x == const0_rtx
8714 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8715 {
8716 asm_fprintf (f, "%czr", code);
8717 break;
8718 }
8719
8720 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8721 {
8722 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8723 break;
8724 }
8725
8726 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8727 {
8728 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8729 break;
8730 }
8731
8732 /* Fall through */
8733
8734 case 0:
8735 if (x == NULL)
8736 {
8737 output_operand_lossage ("missing operand");
8738 return;
8739 }
8740
8741 switch (GET_CODE (x))
8742 {
8743 case REG:
8744 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8745 {
8746 if (REG_NREGS (x) == 1)
8747 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8748 else
8749 {
8750 char suffix
8751 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8752 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8753 REGNO (x) - V0_REGNUM, suffix,
8754 END_REGNO (x) - V0_REGNUM - 1, suffix);
8755 }
8756 }
8757 else
8758 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8759 break;
8760
8761 case MEM:
8762 output_address (GET_MODE (x), XEXP (x, 0));
8763 break;
8764
8765 case LABEL_REF:
8766 case SYMBOL_REF:
8767 output_addr_const (asm_out_file, x);
8768 break;
8769
8770 case CONST_INT:
8771 asm_fprintf (f, "%wd", INTVAL (x));
8772 break;
8773
8774 case CONST:
8775 if (!VECTOR_MODE_P (GET_MODE (x)))
8776 {
8777 output_addr_const (asm_out_file, x);
8778 break;
8779 }
8780 /* fall through */
8781
8782 case CONST_VECTOR:
8783 if (!const_vec_duplicate_p (x, &elt))
8784 {
8785 output_operand_lossage ("invalid vector constant");
8786 return;
8787 }
8788
8789 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8790 asm_fprintf (f, "%wd", INTVAL (elt));
8791 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8792 && aarch64_print_vector_float_operand (f, x, false))
8793 ;
8794 else
8795 {
8796 output_operand_lossage ("invalid vector constant");
8797 return;
8798 }
8799 break;
8800
8801 case CONST_DOUBLE:
8802 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8803 be getting CONST_DOUBLEs holding integers. */
8804 gcc_assert (GET_MODE (x) != VOIDmode);
8805 if (aarch64_float_const_zero_rtx_p (x))
8806 {
8807 fputc ('0', f);
8808 break;
8809 }
8810 else if (aarch64_float_const_representable_p (x))
8811 {
8812 #define buf_size 20
8813 char float_buf[buf_size] = {'\0'};
8814 real_to_decimal_for_mode (float_buf,
8815 CONST_DOUBLE_REAL_VALUE (x),
8816 buf_size, buf_size,
8817 1, GET_MODE (x));
8818 asm_fprintf (asm_out_file, "%s", float_buf);
8819 break;
8820 #undef buf_size
8821 }
8822 output_operand_lossage ("invalid constant");
8823 return;
8824 default:
8825 output_operand_lossage ("invalid operand");
8826 return;
8827 }
8828 break;
8829
8830 case 'A':
8831 if (GET_CODE (x) == HIGH)
8832 x = XEXP (x, 0);
8833
8834 switch (aarch64_classify_symbolic_expression (x))
8835 {
8836 case SYMBOL_SMALL_GOT_4G:
8837 asm_fprintf (asm_out_file, ":got:");
8838 break;
8839
8840 case SYMBOL_SMALL_TLSGD:
8841 asm_fprintf (asm_out_file, ":tlsgd:");
8842 break;
8843
8844 case SYMBOL_SMALL_TLSDESC:
8845 asm_fprintf (asm_out_file, ":tlsdesc:");
8846 break;
8847
8848 case SYMBOL_SMALL_TLSIE:
8849 asm_fprintf (asm_out_file, ":gottprel:");
8850 break;
8851
8852 case SYMBOL_TLSLE24:
8853 asm_fprintf (asm_out_file, ":tprel:");
8854 break;
8855
8856 case SYMBOL_TINY_GOT:
8857 gcc_unreachable ();
8858 break;
8859
8860 default:
8861 break;
8862 }
8863 output_addr_const (asm_out_file, x);
8864 break;
8865
8866 case 'L':
8867 switch (aarch64_classify_symbolic_expression (x))
8868 {
8869 case SYMBOL_SMALL_GOT_4G:
8870 asm_fprintf (asm_out_file, ":lo12:");
8871 break;
8872
8873 case SYMBOL_SMALL_TLSGD:
8874 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8875 break;
8876
8877 case SYMBOL_SMALL_TLSDESC:
8878 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8879 break;
8880
8881 case SYMBOL_SMALL_TLSIE:
8882 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8883 break;
8884
8885 case SYMBOL_TLSLE12:
8886 asm_fprintf (asm_out_file, ":tprel_lo12:");
8887 break;
8888
8889 case SYMBOL_TLSLE24:
8890 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8891 break;
8892
8893 case SYMBOL_TINY_GOT:
8894 asm_fprintf (asm_out_file, ":got:");
8895 break;
8896
8897 case SYMBOL_TINY_TLSIE:
8898 asm_fprintf (asm_out_file, ":gottprel:");
8899 break;
8900
8901 default:
8902 break;
8903 }
8904 output_addr_const (asm_out_file, x);
8905 break;
8906
8907 case 'G':
8908 switch (aarch64_classify_symbolic_expression (x))
8909 {
8910 case SYMBOL_TLSLE24:
8911 asm_fprintf (asm_out_file, ":tprel_hi12:");
8912 break;
8913 default:
8914 break;
8915 }
8916 output_addr_const (asm_out_file, x);
8917 break;
8918
8919 case 'k':
8920 {
8921 HOST_WIDE_INT cond_code;
8922
8923 if (!CONST_INT_P (x))
8924 {
8925 output_operand_lossage ("invalid operand for '%%%c'", code);
8926 return;
8927 }
8928
8929 cond_code = INTVAL (x);
8930 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8931 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8932 }
8933 break;
8934
8935 case 'y':
8936 case 'z':
8937 {
8938 machine_mode mode = GET_MODE (x);
8939
8940 if (GET_CODE (x) != MEM
8941 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8942 {
8943 output_operand_lossage ("invalid operand for '%%%c'", code);
8944 return;
8945 }
8946
8947 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8948 code == 'y'
8949 ? ADDR_QUERY_LDP_STP_N
8950 : ADDR_QUERY_LDP_STP))
8951 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8952 }
8953 break;
8954
8955 default:
8956 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8957 return;
8958 }
8959 }
8960
8961 /* Print address 'x' of a memory access with mode 'mode'.  'type' is the
8962 aarch64_addr_query_type context passed to aarch64_classify_address; it
8963 distinguishes normal memory accesses from LDP/STP and other special forms. */
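/* Illustrative examples of the strings produced below: a register plus
   immediate offset prints as "[x0, 16]", an SVE vector-indexed offset as
   "[x0, #2, mul vl]", a scaled register offset as "[x0, x1, lsl 3]" and a
   LO_SUM address as "[x0, #:lo12:symbol]".  */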
8964 static bool
8965 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8966 aarch64_addr_query_type type)
8967 {
8968 struct aarch64_address_info addr;
8969 unsigned int size;
8970
8971 /* Check all addresses are Pmode - including ILP32. */
8972 if (GET_MODE (x) != Pmode
8973 && (!CONST_INT_P (x)
8974 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8975 {
8976 output_operand_lossage ("invalid address mode");
8977 return false;
8978 }
8979
8980 if (aarch64_classify_address (&addr, x, mode, true, type))
8981 switch (addr.type)
8982 {
8983 case ADDRESS_REG_IMM:
8984 if (known_eq (addr.const_offset, 0))
8985 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8986 else if (aarch64_sve_data_mode_p (mode))
8987 {
8988 HOST_WIDE_INT vnum
8989 = exact_div (addr.const_offset,
8990 BYTES_PER_SVE_VECTOR).to_constant ();
8991 asm_fprintf (f, "[%s, #%wd, mul vl]",
8992 reg_names[REGNO (addr.base)], vnum);
8993 }
8994 else if (aarch64_sve_pred_mode_p (mode))
8995 {
8996 HOST_WIDE_INT vnum
8997 = exact_div (addr.const_offset,
8998 BYTES_PER_SVE_PRED).to_constant ();
8999 asm_fprintf (f, "[%s, #%wd, mul vl]",
9000 reg_names[REGNO (addr.base)], vnum);
9001 }
9002 else
9003 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
9004 INTVAL (addr.offset));
9005 return true;
9006
9007 case ADDRESS_REG_REG:
9008 if (addr.shift == 0)
9009 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9010 reg_names [REGNO (addr.offset)]);
9011 else
9012 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9013 reg_names [REGNO (addr.offset)], addr.shift);
9014 return true;
9015
9016 case ADDRESS_REG_UXTW:
9017 if (addr.shift == 0)
9018 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9019 REGNO (addr.offset) - R0_REGNUM);
9020 else
9021 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9022 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9023 return true;
9024
9025 case ADDRESS_REG_SXTW:
9026 if (addr.shift == 0)
9027 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9028 REGNO (addr.offset) - R0_REGNUM);
9029 else
9030 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9031 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9032 return true;
9033
9034 case ADDRESS_REG_WB:
9035 /* Writeback is only supported for fixed-width modes. */
9036 size = GET_MODE_SIZE (mode).to_constant ();
9037 switch (GET_CODE (x))
9038 {
9039 case PRE_INC:
9040 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9041 return true;
9042 case POST_INC:
9043 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9044 return true;
9045 case PRE_DEC:
9046 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9047 return true;
9048 case POST_DEC:
9049 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9050 return true;
9051 case PRE_MODIFY:
9052 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9053 INTVAL (addr.offset));
9054 return true;
9055 case POST_MODIFY:
9056 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9057 INTVAL (addr.offset));
9058 return true;
9059 default:
9060 break;
9061 }
9062 break;
9063
9064 case ADDRESS_LO_SUM:
9065 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9066 output_addr_const (f, addr.offset);
9067 asm_fprintf (f, "]");
9068 return true;
9069
9070 case ADDRESS_SYMBOLIC:
9071 output_addr_const (f, x);
9072 return true;
9073 }
9074
9075 return false;
9076 }
9077
9078 /* Print address 'x' of a memory access with mode 'mode'. */
9079 static void
9080 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9081 {
9082 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9083 output_addr_const (f, x);
9084 }
9085
9086 bool
9087 aarch64_label_mentioned_p (rtx x)
9088 {
9089 const char *fmt;
9090 int i;
9091
9092 if (GET_CODE (x) == LABEL_REF)
9093 return true;
9094
9095 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9096 referencing instruction, but they are constant offsets, not
9097 symbols. */
9098 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9099 return false;
9100
9101 fmt = GET_RTX_FORMAT (GET_CODE (x));
9102 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9103 {
9104 if (fmt[i] == 'E')
9105 {
9106 int j;
9107
9108 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9109 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9110 return 1;
9111 }
9112 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9113 return 1;
9114 }
9115
9116 return 0;
9117 }
9118
9119 /* Implement REGNO_REG_CLASS. */
9120
9121 enum reg_class
9122 aarch64_regno_regclass (unsigned regno)
9123 {
9124 if (GP_REGNUM_P (regno))
9125 return GENERAL_REGS;
9126
9127 if (regno == SP_REGNUM)
9128 return STACK_REG;
9129
9130 if (regno == FRAME_POINTER_REGNUM
9131 || regno == ARG_POINTER_REGNUM)
9132 return POINTER_REGS;
9133
9134 if (FP_REGNUM_P (regno))
9135 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9136 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9137
9138 if (PR_REGNUM_P (regno))
9139 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9140
9141 return NO_REGS;
9142 }
9143
9144 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9145 If OFFSET is out of range, return an offset of an anchor point
9146 that is in range. Return 0 otherwise. */
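/* Worked example (illustrative): for a 4-byte access at offset 0x12340 the
   code below returns the anchor 0x10000; the remaining offset 0x2340 is
   4-byte aligned and fits the unsigned scaled 12-bit range of a plain
   LDR/STR immediate.  */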
9147
9148 static HOST_WIDE_INT
9149 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9150 machine_mode mode)
9151 {
9152 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9153 if (size > 16)
9154 return (offset + 0x400) & ~0x7f0;
9155
9156 /* For offsets that aren't a multiple of the access size, the limit is
9157 -256...255. */
9158 if (offset & (size - 1))
9159 {
9160 /* BLKmode typically uses LDP of X-registers. */
9161 if (mode == BLKmode)
9162 return (offset + 512) & ~0x3ff;
9163 return (offset + 0x100) & ~0x1ff;
9164 }
9165
9166 /* Small negative offsets are supported. */
9167 if (IN_RANGE (offset, -256, 0))
9168 return 0;
9169
9170 if (mode == TImode || mode == TFmode)
9171 return (offset + 0x100) & ~0x1ff;
9172
9173 /* Use a 12-bit offset scaled by the access size. */
9174 return offset & (~0xfff * size);
9175 }
9176
9177 static rtx
9178 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9179 {
9180 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9181 where mask is selected by alignment and size of the offset.
9182 We try to pick as large a range for the offset as possible to
9183 maximize the chance of a CSE. However, for aligned addresses
9184 we limit the range to 4k so that structures with different sized
9185 elements are likely to use the same base. We need to be careful
9186 not to split a CONST for some forms of address expression, otherwise
9187 it will generate sub-optimal code. */
9188
9189 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9190 {
9191 rtx base = XEXP (x, 0);
9192 rtx offset_rtx = XEXP (x, 1);
9193 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9194
9195 if (GET_CODE (base) == PLUS)
9196 {
9197 rtx op0 = XEXP (base, 0);
9198 rtx op1 = XEXP (base, 1);
9199
9200 /* Force any scaling into a temp for CSE. */
9201 op0 = force_reg (Pmode, op0);
9202 op1 = force_reg (Pmode, op1);
9203
9204 /* Let the pointer register be in op0. */
9205 if (REG_POINTER (op1))
9206 std::swap (op0, op1);
9207
9208 /* If the pointer is virtual or frame related, then we know that
9209 virtual register instantiation or register elimination is going
9210 to apply a second constant. We want the two constants folded
9211 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9212 if (virt_or_elim_regno_p (REGNO (op0)))
9213 {
9214 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9215 NULL_RTX, true, OPTAB_DIRECT);
9216 return gen_rtx_PLUS (Pmode, base, op1);
9217 }
9218
9219 /* Otherwise, in order to encourage CSE (and thence loop strength
9220 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST.  */
9221 base = expand_binop (Pmode, add_optab, op0, op1,
9222 NULL_RTX, true, OPTAB_DIRECT);
9223 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9224 }
9225
9226 HOST_WIDE_INT size;
9227 if (GET_MODE_SIZE (mode).is_constant (&size))
9228 {
9229 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9230 mode);
9231 if (base_offset != 0)
9232 {
9233 base = plus_constant (Pmode, base, base_offset);
9234 base = force_operand (base, NULL_RTX);
9235 return plus_constant (Pmode, base, offset - base_offset);
9236 }
9237 }
9238 }
9239
9240 return x;
9241 }
9242
9243 static reg_class_t
9244 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9245 reg_class_t rclass,
9246 machine_mode mode,
9247 secondary_reload_info *sri)
9248 {
9249 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9250 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9251 comment at the head of aarch64-sve.md for more details about the
9252 big-endian handling. */
9253 if (BYTES_BIG_ENDIAN
9254 && reg_class_subset_p (rclass, FP_REGS)
9255 && !((REG_P (x) && HARD_REGISTER_P (x))
9256 || aarch64_simd_valid_immediate (x, NULL))
9257 && aarch64_sve_data_mode_p (mode))
9258 {
9259 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9260 return NO_REGS;
9261 }
9262
9263 /* If we have to disable direct literal pool loads and stores because the
9264 function is too big, then we need a scratch register. */
9265 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9266 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9267 || targetm.vector_mode_supported_p (GET_MODE (x)))
9268 && !aarch64_pcrelative_literal_loads)
9269 {
9270 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9271 return NO_REGS;
9272 }
9273
9274 /* Without the TARGET_SIMD instructions we cannot move a Q register
9275 to a Q register directly. We need a scratch. */
9276 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9277 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9278 && reg_class_subset_p (rclass, FP_REGS))
9279 {
9280 sri->icode = code_for_aarch64_reload_mov (mode);
9281 return NO_REGS;
9282 }
9283
9284 /* A TFmode or TImode memory access should be handled via FP_REGS,
9285 because AArch64 has richer addressing modes for LDR/STR instructions
9286 than for LDP/STP instructions. */
9287 if (TARGET_FLOAT && rclass == GENERAL_REGS
9288 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9289 return FP_REGS;
9290
9291 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9292 return GENERAL_REGS;
9293
9294 return NO_REGS;
9295 }
9296
9297 static bool
9298 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9299 {
9300 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9301
9302 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9303 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9304 if (frame_pointer_needed)
9305 return to == HARD_FRAME_POINTER_REGNUM;
9306 return true;
9307 }
9308
9309 poly_int64
9310 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9311 {
9312 if (to == HARD_FRAME_POINTER_REGNUM)
9313 {
9314 if (from == ARG_POINTER_REGNUM)
9315 return cfun->machine->frame.hard_fp_offset;
9316
9317 if (from == FRAME_POINTER_REGNUM)
9318 return cfun->machine->frame.hard_fp_offset
9319 - cfun->machine->frame.locals_offset;
9320 }
9321
9322 if (to == STACK_POINTER_REGNUM)
9323 {
9324 if (from == FRAME_POINTER_REGNUM)
9325 return cfun->machine->frame.frame_size
9326 - cfun->machine->frame.locals_offset;
9327 }
9328
9329 return cfun->machine->frame.frame_size;
9330 }
9331
9332 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9333 previous frame. */
9334
9335 rtx
9336 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9337 {
9338 if (count != 0)
9339 return const0_rtx;
9340 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9341 }
9342
9343
9344 static void
9345 aarch64_asm_trampoline_template (FILE *f)
9346 {
9347 int offset1 = 16;
9348 int offset2 = 20;
9349
9350 if (aarch64_bti_enabled ())
9351 {
9352 asm_fprintf (f, "\thint\t34 // bti c\n");
9353 offset1 -= 4;
9354 offset2 -= 4;
9355 }
9356
9357 if (TARGET_ILP32)
9358 {
9359 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9360 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9361 offset1);
9362 }
9363 else
9364 {
9365 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9366 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9367 offset2);
9368 }
9369 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9370
9371 /* The trampoline needs an extra padding instruction. If BTI is
9372 enabled, the padding instruction is replaced by the BTI instruction at
9373 the beginning. */
9374 if (!aarch64_bti_enabled ())
9375 assemble_aligned_integer (4, const0_rtx);
9376
9377 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9378 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9379 }
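/* Illustrative layout (a sketch, assuming IP1_REGNUM is x17 and
   STATIC_CHAIN_REGNUM is x18, as on standard AArch64): without BTI the LP64
   template printed above is roughly

	ldr	x17, .+16	// load the function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding
	.dword	0		// function address slot
	.dword	0		// static chain slot

   with the two pointer slots filled in by aarch64_trampoline_init below.  */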
9380
9381 static void
9382 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9383 {
9384 rtx fnaddr, mem, a_tramp;
9385 const int tramp_code_sz = 16;
9386
9387 /* We don't need to copy the trailing D-words; we fill those in below. */
9388 emit_block_move (m_tramp, assemble_trampoline_template (),
9389 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9390 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9391 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9392 if (GET_MODE (fnaddr) != ptr_mode)
9393 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9394 emit_move_insn (mem, fnaddr);
9395
9396 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9397 emit_move_insn (mem, chain_value);
9398
9399 /* XXX We should really define a "clear_cache" pattern and use
9400 gen_clear_cache(). */
9401 a_tramp = XEXP (m_tramp, 0);
9402 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9403 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9404 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9405 ptr_mode);
9406 }
9407
9408 static unsigned char
9409 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9410 {
9411 /* ??? Logically we should only need to provide a value when
9412 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9413 can hold MODE, but at the moment we need to handle all modes.
9414 Just ignore any runtime parts for registers that can't store them. */
9415 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9416 unsigned int nregs;
9417 switch (regclass)
9418 {
9419 case TAILCALL_ADDR_REGS:
9420 case POINTER_REGS:
9421 case GENERAL_REGS:
9422 case ALL_REGS:
9423 case POINTER_AND_FP_REGS:
9424 case FP_REGS:
9425 case FP_LO_REGS:
9426 case FP_LO8_REGS:
9427 if (aarch64_sve_data_mode_p (mode)
9428 && constant_multiple_p (GET_MODE_SIZE (mode),
9429 BYTES_PER_SVE_VECTOR, &nregs))
9430 return nregs;
9431 return (aarch64_vector_data_mode_p (mode)
9432 ? CEIL (lowest_size, UNITS_PER_VREG)
9433 : CEIL (lowest_size, UNITS_PER_WORD));
9434 case STACK_REG:
9435 case PR_REGS:
9436 case PR_LO_REGS:
9437 case PR_HI_REGS:
9438 return 1;
9439
9440 case NO_REGS:
9441 return 0;
9442
9443 default:
9444 break;
9445 }
9446 gcc_unreachable ();
9447 }
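/* Illustrative examples: a V4SImode value in FP_REGS takes one vector
   register (CEIL (16, UNITS_PER_VREG) == 1), whereas a TImode value in
   GENERAL_REGS takes two X-registers (CEIL (16, UNITS_PER_WORD) == 2).  */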
9448
9449 static reg_class_t
9450 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9451 {
9452 if (regclass == POINTER_REGS)
9453 return GENERAL_REGS;
9454
9455 if (regclass == STACK_REG)
9456 {
9457 if (REG_P(x)
9458 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9459 return regclass;
9460
9461 return NO_REGS;
9462 }
9463
9464 /* Register elimination can result in a request for
9465 SP+constant->FP_REGS. We cannot support such operations, which
9466 use SP as source and an FP_REG as destination, so reject them
9467 right now. */
9468 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9469 {
9470 rtx lhs = XEXP (x, 0);
9471
9472 /* Look through a possible SUBREG introduced by ILP32. */
9473 if (GET_CODE (lhs) == SUBREG)
9474 lhs = SUBREG_REG (lhs);
9475
9476 gcc_assert (REG_P (lhs));
9477 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9478 POINTER_REGS));
9479 return NO_REGS;
9480 }
9481
9482 return regclass;
9483 }
9484
9485 void
9486 aarch64_asm_output_labelref (FILE* f, const char *name)
9487 {
9488 asm_fprintf (f, "%U%s", name);
9489 }
9490
9491 static void
9492 aarch64_elf_asm_constructor (rtx symbol, int priority)
9493 {
9494 if (priority == DEFAULT_INIT_PRIORITY)
9495 default_ctor_section_asm_out_constructor (symbol, priority);
9496 else
9497 {
9498 section *s;
9499 /* Although the priority is known to be in the range [0, 65535], so 18
9500 bytes would be enough, the compiler might not know that. To avoid a
9501 -Wformat-truncation false positive, use a larger size. */
9502 char buf[23];
9503 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9504 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9505 switch_to_section (s);
9506 assemble_align (POINTER_SIZE);
9507 assemble_aligned_integer (POINTER_BYTES, symbol);
9508 }
9509 }
9510
9511 static void
9512 aarch64_elf_asm_destructor (rtx symbol, int priority)
9513 {
9514 if (priority == DEFAULT_INIT_PRIORITY)
9515 default_dtor_section_asm_out_destructor (symbol, priority);
9516 else
9517 {
9518 section *s;
9519 /* Although the priority is known to be in the range [0, 65535], so 18
9520 bytes would be enough, the compiler might not know that. To avoid a
9521 -Wformat-truncation false positive, use a larger size. */
9522 char buf[23];
9523 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9524 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9525 switch_to_section (s);
9526 assemble_align (POINTER_SIZE);
9527 assemble_aligned_integer (POINTER_BYTES, symbol);
9528 }
9529 }
9530
9531 const char*
9532 aarch64_output_casesi (rtx *operands)
9533 {
9534 char buf[100];
9535 char label[100];
9536 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9537 int index;
9538 static const char *const patterns[4][2] =
9539 {
9540 {
9541 "ldrb\t%w3, [%0,%w1,uxtw]",
9542 "add\t%3, %4, %w3, sxtb #2"
9543 },
9544 {
9545 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9546 "add\t%3, %4, %w3, sxth #2"
9547 },
9548 {
9549 "ldr\t%w3, [%0,%w1,uxtw #2]",
9550 "add\t%3, %4, %w3, sxtw #2"
9551 },
9552 /* We assume that DImode is only generated when not optimizing and
9553 that we don't really need 64-bit address offsets. That would
9554 imply an object file with 8GB of code in a single function! */
9555 {
9556 "ldr\t%w3, [%0,%w1,uxtw #2]",
9557 "add\t%3, %4, %w3, sxtw #2"
9558 }
9559 };
9560
9561 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9562
9563 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9564 index = exact_log2 (GET_MODE_SIZE (mode));
9565
9566 gcc_assert (index >= 0 && index <= 3);
9567
9568 /* Need to implement table size reduction, by changing the code below. */
9569 output_asm_insn (patterns[index][0], operands);
9570 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9571 snprintf (buf, sizeof (buf),
9572 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9573 output_asm_insn (buf, operands);
9574 output_asm_insn (patterns[index][1], operands);
9575 output_asm_insn ("br\t%3", operands);
9576 assemble_label (asm_out_file, label);
9577 return "";
9578 }
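/* Illustrative output (a sketch using the operand numbers as register
   numbers): for a 4-byte dispatch table the sequence printed above is
   roughly

	ldr	w3, [x0, w1, uxtw #2]	// load the offset for case w1
	adr	x4, .LrtxN		// anchor label emitted below
	add	x3, x4, w3, sxtw #2
	br	x3
   .LrtxN:

   where x0 is the table address and w1 the case index.  */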
9579
9580
9581 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9582 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9583 operator. */
9584
9585 int
9586 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9587 {
9588 if (shift >= 0 && shift <= 3)
9589 {
9590 int size;
9591 for (size = 8; size <= 32; size *= 2)
9592 {
9593 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9594 if (mask == bits << shift)
9595 return size;
9596 }
9597 }
9598 return 0;
9599 }
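/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, matching a UXTB-style
   operand shifted left by one, and aarch64_uxt_size (0, 0xffff) returns 16
   (UXTH).  Any other mask/shift combination yields 0.  */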
9600
9601 /* Constant pools are per-function only when PC-relative
9602 literal loads are enabled or we are using the large memory
9603 model. */
9604
9605 static inline bool
9606 aarch64_can_use_per_function_literal_pools_p (void)
9607 {
9608 return (aarch64_pcrelative_literal_loads
9609 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9610 }
9611
9612 static bool
9613 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9614 {
9615 /* We can't use blocks for constants when we're using a per-function
9616 constant pool. */
9617 return !aarch64_can_use_per_function_literal_pools_p ();
9618 }
9619
9620 /* Select appropriate section for constants depending
9621 on where we place literal pools. */
9622
9623 static section *
9624 aarch64_select_rtx_section (machine_mode mode,
9625 rtx x,
9626 unsigned HOST_WIDE_INT align)
9627 {
9628 if (aarch64_can_use_per_function_literal_pools_p ())
9629 return function_section (current_function_decl);
9630
9631 return default_elf_select_rtx_section (mode, x, align);
9632 }
9633
9634 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9635 void
9636 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9637 HOST_WIDE_INT offset)
9638 {
9639 /* When using per-function literal pools, we must ensure that any code
9640 section is aligned to the minimal instruction length, lest we get
9641 errors from the assembler re "unaligned instructions". */
9642 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9643 ASM_OUTPUT_ALIGN (f, 2);
9644 }
9645
9646 /* Costs. */
9647
9648 /* Helper function for rtx cost calculation. Strip a shift expression
9649 from X. Returns the inner operand if successful, or the original
9650 expression on failure. */
9651 static rtx
9652 aarch64_strip_shift (rtx x)
9653 {
9654 rtx op = x;
9655
9656 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9657 we can convert both to ROR during final output. */
9658 if ((GET_CODE (op) == ASHIFT
9659 || GET_CODE (op) == ASHIFTRT
9660 || GET_CODE (op) == LSHIFTRT
9661 || GET_CODE (op) == ROTATERT
9662 || GET_CODE (op) == ROTATE)
9663 && CONST_INT_P (XEXP (op, 1)))
9664 return XEXP (op, 0);
9665
9666 if (GET_CODE (op) == MULT
9667 && CONST_INT_P (XEXP (op, 1))
9668 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9669 return XEXP (op, 0);
9670
9671 return x;
9672 }
9673
9674 /* Helper function for rtx cost calculation. Strip an extend
9675 expression from X. Returns the inner operand if successful, or the
9676 original expression on failure. We deal with a number of possible
9677 canonicalization variations here. If STRIP_SHIFT is true, then
9678 we can strip off a shift also. */
9679 static rtx
9680 aarch64_strip_extend (rtx x, bool strip_shift)
9681 {
9682 scalar_int_mode mode;
9683 rtx op = x;
9684
9685 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9686 return op;
9687
9688 /* Zero and sign extraction of a widened value. */
9689 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9690 && XEXP (op, 2) == const0_rtx
9691 && GET_CODE (XEXP (op, 0)) == MULT
9692 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9693 XEXP (op, 1)))
9694 return XEXP (XEXP (op, 0), 0);
9695
9696 /* It can also be represented (for zero-extend) as an AND with an
9697 immediate. */
9698 if (GET_CODE (op) == AND
9699 && GET_CODE (XEXP (op, 0)) == MULT
9700 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9701 && CONST_INT_P (XEXP (op, 1))
9702 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9703 INTVAL (XEXP (op, 1))) != 0)
9704 return XEXP (XEXP (op, 0), 0);
9705
9706 /* Now handle extended register, as this may also have an optional
9707 left shift by 1..4. */
9708 if (strip_shift
9709 && GET_CODE (op) == ASHIFT
9710 && CONST_INT_P (XEXP (op, 1))
9711 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9712 op = XEXP (op, 0);
9713
9714 if (GET_CODE (op) == ZERO_EXTEND
9715 || GET_CODE (op) == SIGN_EXTEND)
9716 op = XEXP (op, 0);
9717
9718 if (op != x)
9719 return op;
9720
9721 return x;
9722 }
9723
9724 /* Return true iff CODE is a shift supported in combination
9725 with arithmetic instructions. */
9726
9727 static bool
9728 aarch64_shift_p (enum rtx_code code)
9729 {
9730 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9731 }
9732
9733
9734 /* Return true iff X is a cheap shift without a sign extend. */
9735
9736 static bool
9737 aarch64_cheap_mult_shift_p (rtx x)
9738 {
9739 rtx op0, op1;
9740
9741 op0 = XEXP (x, 0);
9742 op1 = XEXP (x, 1);
9743
9744 if (!(aarch64_tune_params.extra_tuning_flags
9745 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9746 return false;
9747
9748 if (GET_CODE (op0) == SIGN_EXTEND)
9749 return false;
9750
9751 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9752 && UINTVAL (op1) <= 4)
9753 return true;
9754
9755 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9756 return false;
9757
9758 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9759
9760 if (l2 > 0 && l2 <= 4)
9761 return true;
9762
9763 return false;
9764 }
9765
9766 /* Helper function for rtx cost calculation. Calculate the cost of
9767 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9768 Return the calculated cost of the expression, recursing manually into
9769 operands where needed. */
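/* For instance, when this is called for (mult x (const_int 4)) inside a
   PLUS, the multiply is treated as a shift by two and, when optimizing for
   speed, is typically costed as an ADD with shift-by-immediate
   (alu.arith_shift) plus the cost of X.  */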
9770
9771 static int
9772 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9773 {
9774 rtx op0, op1;
9775 const struct cpu_cost_table *extra_cost
9776 = aarch64_tune_params.insn_extra_cost;
9777 int cost = 0;
9778 bool compound_p = (outer == PLUS || outer == MINUS);
9779 machine_mode mode = GET_MODE (x);
9780
9781 gcc_checking_assert (code == MULT);
9782
9783 op0 = XEXP (x, 0);
9784 op1 = XEXP (x, 1);
9785
9786 if (VECTOR_MODE_P (mode))
9787 mode = GET_MODE_INNER (mode);
9788
9789 /* Integer multiply/fma. */
9790 if (GET_MODE_CLASS (mode) == MODE_INT)
9791 {
9792 /* The multiply will be canonicalized as a shift, cost it as such. */
9793 if (aarch64_shift_p (GET_CODE (x))
9794 || (CONST_INT_P (op1)
9795 && exact_log2 (INTVAL (op1)) > 0))
9796 {
9797 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9798 || GET_CODE (op0) == SIGN_EXTEND;
9799 if (speed)
9800 {
9801 if (compound_p)
9802 {
9803 /* If the shift is considered cheap,
9804 then don't add any cost. */
9805 if (aarch64_cheap_mult_shift_p (x))
9806 ;
9807 else if (REG_P (op1))
9808 /* ARITH + shift-by-register. */
9809 cost += extra_cost->alu.arith_shift_reg;
9810 else if (is_extend)
9811 /* ARITH + extended register. We don't have a cost field
9812 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9813 cost += extra_cost->alu.extend_arith;
9814 else
9815 /* ARITH + shift-by-immediate. */
9816 cost += extra_cost->alu.arith_shift;
9817 }
9818 else
9819 /* LSL (immediate). */
9820 cost += extra_cost->alu.shift;
9821
9822 }
9823 /* Strip extends as we will have costed them in the case above. */
9824 if (is_extend)
9825 op0 = aarch64_strip_extend (op0, true);
9826
9827 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9828
9829 return cost;
9830 }
9831
9832 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9833 compound and let the below cases handle it. After all, MNEG is a
9834 special-case alias of MSUB. */
9835 if (GET_CODE (op0) == NEG)
9836 {
9837 op0 = XEXP (op0, 0);
9838 compound_p = true;
9839 }
9840
9841 /* Integer multiplies or FMAs have zero/sign extending variants. */
9842 if ((GET_CODE (op0) == ZERO_EXTEND
9843 && GET_CODE (op1) == ZERO_EXTEND)
9844 || (GET_CODE (op0) == SIGN_EXTEND
9845 && GET_CODE (op1) == SIGN_EXTEND))
9846 {
9847 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9848 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9849
9850 if (speed)
9851 {
9852 if (compound_p)
9853 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9854 cost += extra_cost->mult[0].extend_add;
9855 else
9856 /* MUL/SMULL/UMULL. */
9857 cost += extra_cost->mult[0].extend;
9858 }
9859
9860 return cost;
9861 }
9862
9863 /* This is either an integer multiply or a MADD. In both cases
9864 we want to recurse and cost the operands. */
9865 cost += rtx_cost (op0, mode, MULT, 0, speed);
9866 cost += rtx_cost (op1, mode, MULT, 1, speed);
9867
9868 if (speed)
9869 {
9870 if (compound_p)
9871 /* MADD/MSUB. */
9872 cost += extra_cost->mult[mode == DImode].add;
9873 else
9874 /* MUL. */
9875 cost += extra_cost->mult[mode == DImode].simple;
9876 }
9877
9878 return cost;
9879 }
9880 else
9881 {
9882 if (speed)
9883 {
9884 /* Floating-point FMA/FMUL can also support negations of the
9885 operands, unless the rounding mode is upward or downward in
9886 which case FNMUL is different from FMUL with operand negation. */
9887 bool neg0 = GET_CODE (op0) == NEG;
9888 bool neg1 = GET_CODE (op1) == NEG;
9889 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9890 {
9891 if (neg0)
9892 op0 = XEXP (op0, 0);
9893 if (neg1)
9894 op1 = XEXP (op1, 0);
9895 }
9896
9897 if (compound_p)
9898 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9899 cost += extra_cost->fp[mode == DFmode].fma;
9900 else
9901 /* FMUL/FNMUL. */
9902 cost += extra_cost->fp[mode == DFmode].mult;
9903 }
9904
9905 cost += rtx_cost (op0, mode, MULT, 0, speed);
9906 cost += rtx_cost (op1, mode, MULT, 1, speed);
9907 return cost;
9908 }
9909 }
9910
9911 static int
9912 aarch64_address_cost (rtx x,
9913 machine_mode mode,
9914 addr_space_t as ATTRIBUTE_UNUSED,
9915 bool speed)
9916 {
9917 enum rtx_code c = GET_CODE (x);
9918 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9919 struct aarch64_address_info info;
9920 int cost = 0;
9921 info.shift = 0;
9922
9923 if (!aarch64_classify_address (&info, x, mode, false))
9924 {
9925 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9926 {
9927 /* This is a CONST or SYMBOL ref which will be split
9928 in a different way depending on the code model in use.
9929 Cost it through the generic infrastructure. */
9930 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9931 /* Divide through by the cost of one instruction to
9932 bring it to the same units as the address costs. */
9933 cost_symbol_ref /= COSTS_N_INSNS (1);
9934 /* The cost is then the cost of preparing the address,
9935 followed by an immediate (possibly 0) offset. */
9936 return cost_symbol_ref + addr_cost->imm_offset;
9937 }
9938 else
9939 {
9940 /* This is most likely a jump table from a case
9941 statement. */
9942 return addr_cost->register_offset;
9943 }
9944 }
9945
9946 switch (info.type)
9947 {
9948 case ADDRESS_LO_SUM:
9949 case ADDRESS_SYMBOLIC:
9950 case ADDRESS_REG_IMM:
9951 cost += addr_cost->imm_offset;
9952 break;
9953
9954 case ADDRESS_REG_WB:
9955 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9956 cost += addr_cost->pre_modify;
9957 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9958 cost += addr_cost->post_modify;
9959 else
9960 gcc_unreachable ();
9961
9962 break;
9963
9964 case ADDRESS_REG_REG:
9965 cost += addr_cost->register_offset;
9966 break;
9967
9968 case ADDRESS_REG_SXTW:
9969 cost += addr_cost->register_sextend;
9970 break;
9971
9972 case ADDRESS_REG_UXTW:
9973 cost += addr_cost->register_zextend;
9974 break;
9975
9976 default:
9977 gcc_unreachable ();
9978 }
9979
9980
9981 if (info.shift > 0)
9982 {
9983 /* For the sake of calculating the cost of the shifted register
9984 component, we can treat same sized modes in the same way. */
9985 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9986 cost += addr_cost->addr_scale_costs.hi;
9987 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9988 cost += addr_cost->addr_scale_costs.si;
9989 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9990 cost += addr_cost->addr_scale_costs.di;
9991 else
9992 /* We can't tell, or this is a 128-bit vector. */
9993 cost += addr_cost->addr_scale_costs.ti;
9994 }
9995
9996 return cost;
9997 }
9998
9999 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10000 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
10001 to be well predicted. */
10002
10003 int
10004 aarch64_branch_cost (bool speed_p, bool predictable_p)
10005 {
10006 /* When optimizing for speed, use the cost of unpredictable branches. */
10007 const struct cpu_branch_cost *branch_costs =
10008 aarch64_tune_params.branch_costs;
10009
10010 if (!speed_p || predictable_p)
10011 return branch_costs->predictable;
10012 else
10013 return branch_costs->unpredictable;
10014 }
10015
10016 /* Return true if the RTX X in mode MODE is a zero or sign extract
10017 usable in an ADD or SUB (extended register) instruction. */
10018 static bool
10019 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10020 {
10021 /* Catch add with a sign extract.
10022 This is add_<optab><mode>_multp2. */
10023 if (GET_CODE (x) == SIGN_EXTRACT
10024 || GET_CODE (x) == ZERO_EXTRACT)
10025 {
10026 rtx op0 = XEXP (x, 0);
10027 rtx op1 = XEXP (x, 1);
10028 rtx op2 = XEXP (x, 2);
10029
10030 if (GET_CODE (op0) == MULT
10031 && CONST_INT_P (op1)
10032 && op2 == const0_rtx
10033 && CONST_INT_P (XEXP (op0, 1))
10034 && aarch64_is_extend_from_extract (mode,
10035 XEXP (op0, 1),
10036 op1))
10037 {
10038 return true;
10039 }
10040 }
10041 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10042 No shift. */
10043 else if (GET_CODE (x) == SIGN_EXTEND
10044 || GET_CODE (x) == ZERO_EXTEND)
10045 return REG_P (XEXP (x, 0));
10046
10047 return false;
10048 }
10049
10050 static bool
10051 aarch64_frint_unspec_p (unsigned int u)
10052 {
10053 switch (u)
10054 {
10055 case UNSPEC_FRINTZ:
10056 case UNSPEC_FRINTP:
10057 case UNSPEC_FRINTM:
10058 case UNSPEC_FRINTA:
10059 case UNSPEC_FRINTN:
10060 case UNSPEC_FRINTX:
10061 case UNSPEC_FRINTI:
10062 return true;
10063
10064 default:
10065 return false;
10066 }
10067 }
10068
10069 /* Return true iff X is an rtx that will match an extr instruction
10070 i.e. as described in the *extr<mode>5_insn family of patterns.
10071 OP0 and OP1 will be set to the operands of the shifts involved
10072 on success and will be NULL_RTX otherwise. */
10073
10074 static bool
10075 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10076 {
10077 rtx op0, op1;
10078 scalar_int_mode mode;
10079 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10080 return false;
10081
10082 *res_op0 = NULL_RTX;
10083 *res_op1 = NULL_RTX;
10084
10085 if (GET_CODE (x) != IOR)
10086 return false;
10087
10088 op0 = XEXP (x, 0);
10089 op1 = XEXP (x, 1);
10090
10091 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10092 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10093 {
10094 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10095 if (GET_CODE (op1) == ASHIFT)
10096 std::swap (op0, op1);
10097
10098 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10099 return false;
10100
10101 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10102 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10103
10104 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10105 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10106 {
10107 *res_op0 = XEXP (op0, 0);
10108 *res_op1 = XEXP (op1, 0);
10109 return true;
10110 }
10111 }
10112
10113 return false;
10114 }
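/* Example (illustrative): for
   (ior:DI (ashift:DI x (const_int 16)) (lshiftrt:DI y (const_int 48)))
   the shift amounts sum to 64, so this returns true with *RES_OP0 = x and
   *RES_OP1 = y, corresponding to an EXTR with an lsb of 48.  */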
10115
10116 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10117 storing it in *COST. Result is true if the total cost of the operation
10118 has now been calculated. */
10119 static bool
10120 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10121 {
10122 rtx inner;
10123 rtx comparator;
10124 enum rtx_code cmpcode;
10125
10126 if (COMPARISON_P (op0))
10127 {
10128 inner = XEXP (op0, 0);
10129 comparator = XEXP (op0, 1);
10130 cmpcode = GET_CODE (op0);
10131 }
10132 else
10133 {
10134 inner = op0;
10135 comparator = const0_rtx;
10136 cmpcode = NE;
10137 }
10138
10139 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10140 {
10141 /* Conditional branch. */
10142 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10143 return true;
10144 else
10145 {
10146 if (cmpcode == NE || cmpcode == EQ)
10147 {
10148 if (comparator == const0_rtx)
10149 {
10150 /* TBZ/TBNZ/CBZ/CBNZ. */
10151 if (GET_CODE (inner) == ZERO_EXTRACT)
10152 /* TBZ/TBNZ. */
10153 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10154 ZERO_EXTRACT, 0, speed);
10155 else
10156 /* CBZ/CBNZ. */
10157 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10158
10159 return true;
10160 }
10161 }
10162 else if (cmpcode == LT || cmpcode == GE)
10163 {
10164 /* TBZ/TBNZ. */
10165 if (comparator == const0_rtx)
10166 return true;
10167 }
10168 }
10169 }
10170 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10171 {
10172 /* CCMP. */
10173 if (GET_CODE (op1) == COMPARE)
10174 {
10175 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10176 if (XEXP (op1, 1) == const0_rtx)
10177 *cost += 1;
10178 if (speed)
10179 {
10180 machine_mode mode = GET_MODE (XEXP (op1, 0));
10181 const struct cpu_cost_table *extra_cost
10182 = aarch64_tune_params.insn_extra_cost;
10183
10184 if (GET_MODE_CLASS (mode) == MODE_INT)
10185 *cost += extra_cost->alu.arith;
10186 else
10187 *cost += extra_cost->fp[mode == DFmode].compare;
10188 }
10189 return true;
10190 }
10191
10192 /* It's a conditional operation based on the status flags,
10193 so it must be some flavor of CSEL. */
10194
10195 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10196 if (GET_CODE (op1) == NEG
10197 || GET_CODE (op1) == NOT
10198 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10199 op1 = XEXP (op1, 0);
10200 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10201 {
10202 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10203 op1 = XEXP (op1, 0);
10204 op2 = XEXP (op2, 0);
10205 }
10206
10207 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10208 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10209 return true;
10210 }
10211
10212 /* We don't know what this is, cost all operands. */
10213 return false;
10214 }
10215
10216 /* Check whether X is a bitfield operation of the form shift + extend that
10217 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10218 operand to which the bitfield operation is applied. Otherwise return
10219 NULL_RTX. */
10220
10221 static rtx
10222 aarch64_extend_bitfield_pattern_p (rtx x)
10223 {
10224 rtx_code outer_code = GET_CODE (x);
10225 machine_mode outer_mode = GET_MODE (x);
10226
10227 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10228 && outer_mode != SImode && outer_mode != DImode)
10229 return NULL_RTX;
10230
10231 rtx inner = XEXP (x, 0);
10232 rtx_code inner_code = GET_CODE (inner);
10233 machine_mode inner_mode = GET_MODE (inner);
10234 rtx op = NULL_RTX;
10235
10236 switch (inner_code)
10237 {
10238 case ASHIFT:
10239 if (CONST_INT_P (XEXP (inner, 1))
10240 && (inner_mode == QImode || inner_mode == HImode))
10241 op = XEXP (inner, 0);
10242 break;
10243 case LSHIFTRT:
10244 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10245 && (inner_mode == QImode || inner_mode == HImode))
10246 op = XEXP (inner, 0);
10247 break;
10248 case ASHIFTRT:
10249 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10250 && (inner_mode == QImode || inner_mode == HImode))
10251 op = XEXP (inner, 0);
10252 break;
10253 default:
10254 break;
10255 }
10256
10257 return op;
10258 }
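/* For example, (zero_extend:SI (lshiftrt:HI r (const_int 3))) returns R,
   since the shift plus zero-extension maps onto a single UBFX.  */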
10259
10260 /* Return true if the mask and a shift amount from an RTX of the form
10261 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10262 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10263
10264 bool
10265 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10266 rtx shft_amnt)
10267 {
10268 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10269 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10270 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10271 && (INTVAL (mask)
10272 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10273 }
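/* Example (illustrative): in SImode a mask of 0xff0 with a shift amount of 4
   is accepted, since (0xff0 >> 4) + 1 is a power of two and no mask bits
   fall below the shift; this corresponds to a UBFIZ of an 8-bit field at
   position 4.  */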
10274
10275 /* Return true if the masks and a shift amount from an RTX of the form
10276 ((x & MASK1) | ((y << SHFT_AMNT) & MASK2)) are valid to combine into
10277 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10278
10279 bool
10280 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10281 unsigned HOST_WIDE_INT mask1,
10282 unsigned HOST_WIDE_INT shft_amnt,
10283 unsigned HOST_WIDE_INT mask2)
10284 {
10285 unsigned HOST_WIDE_INT t;
10286
10287 /* Verify that there is no overlap in what bits are set in the two masks. */
10288 if (mask1 != ~mask2)
10289 return false;
10290
10291 /* Verify that mask2 is not all zeros or ones. */
10292 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10293 return false;
10294
10295 /* The shift amount should always be less than the mode size. */
10296 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10297
10298 /* Verify that the mask being shifted is contiguous and would be in the
10299 least significant bits after shifting by shft_amnt. */
10300 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10301 return (t == (t & -t));
10302 }
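/* Example (illustrative): in SImode, SHFT_AMNT == 8, MASK2 == 0xff00 and
   MASK1 == ~0xff00 satisfy all of the checks above; the combination
   corresponds to a BFI inserting an 8-bit field at bit position 8.  */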
10303
10304 /* Calculate the cost of calculating X, storing it in *COST. Result
10305 is true if the total cost of the operation has now been calculated. */
10306 static bool
10307 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10308 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10309 {
10310 rtx op0, op1, op2;
10311 const struct cpu_cost_table *extra_cost
10312 = aarch64_tune_params.insn_extra_cost;
10313 int code = GET_CODE (x);
10314 scalar_int_mode int_mode;
10315
10316 /* By default, assume that everything has equivalent cost to the
10317 cheapest instruction. Any additional costs are applied as a delta
10318 above this default. */
10319 *cost = COSTS_N_INSNS (1);
10320
10321 switch (code)
10322 {
10323 case SET:
10324 /* The cost depends entirely on the operands to SET. */
10325 *cost = 0;
10326 op0 = SET_DEST (x);
10327 op1 = SET_SRC (x);
10328
10329 switch (GET_CODE (op0))
10330 {
10331 case MEM:
10332 if (speed)
10333 {
10334 rtx address = XEXP (op0, 0);
10335 if (VECTOR_MODE_P (mode))
10336 *cost += extra_cost->ldst.storev;
10337 else if (GET_MODE_CLASS (mode) == MODE_INT)
10338 *cost += extra_cost->ldst.store;
10339 else if (mode == SFmode)
10340 *cost += extra_cost->ldst.storef;
10341 else if (mode == DFmode)
10342 *cost += extra_cost->ldst.stored;
10343
10344 *cost +=
10345 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10346 0, speed));
10347 }
10348
10349 *cost += rtx_cost (op1, mode, SET, 1, speed);
10350 return true;
10351
10352 case SUBREG:
10353 if (! REG_P (SUBREG_REG (op0)))
10354 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10355
10356 /* Fall through. */
10357 case REG:
10358 /* The cost is one per vector-register copied. */
10359 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10360 {
10361 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10362 *cost = COSTS_N_INSNS (nregs);
10363 }
10364 /* const0_rtx is in general free, but we will use an
10365 instruction to set a register to 0. */
10366 else if (REG_P (op1) || op1 == const0_rtx)
10367 {
10368 /* The cost is 1 per register copied. */
10369 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10370 *cost = COSTS_N_INSNS (nregs);
10371 }
10372 else
10373 /* Cost is just the cost of the RHS of the set. */
10374 *cost += rtx_cost (op1, mode, SET, 1, speed);
10375 return true;
10376
10377 case ZERO_EXTRACT:
10378 case SIGN_EXTRACT:
10379 /* Bit-field insertion. Strip any redundant widening of
10380 the RHS to meet the width of the target. */
10381 if (GET_CODE (op1) == SUBREG)
10382 op1 = SUBREG_REG (op1);
10383 if ((GET_CODE (op1) == ZERO_EXTEND
10384 || GET_CODE (op1) == SIGN_EXTEND)
10385 && CONST_INT_P (XEXP (op0, 1))
10386 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10387 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10388 op1 = XEXP (op1, 0);
10389
10390 if (CONST_INT_P (op1))
10391 {
10392 /* MOV immediate is assumed to always be cheap. */
10393 *cost = COSTS_N_INSNS (1);
10394 }
10395 else
10396 {
10397 /* BFM. */
10398 if (speed)
10399 *cost += extra_cost->alu.bfi;
10400 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10401 }
10402
10403 return true;
10404
10405 default:
10406 /* We can't make sense of this, assume default cost. */
10407 *cost = COSTS_N_INSNS (1);
10408 return false;
10409 }
10410 return false;
10411
10412 case CONST_INT:
10413 /* If an instruction can incorporate a constant within the
10414 instruction, the instruction's expression avoids calling
10415 rtx_cost() on the constant. If rtx_cost() is called on a
10416 constant, then it is usually because the constant must be
10417 moved into a register by one or more instructions.
10418
10419 The exception is constant 0, which can be expressed
10420 as XZR/WZR and is therefore free. The exception to this is
10421 if we have (set (reg) (const0_rtx)) in which case we must cost
10422 the move. However, we can catch that when we cost the SET, so
10423 we don't need to consider that here. */
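/* Illustrative example: a constant such as 0x12345678 typically needs a MOV
   plus one MOVK, so aarch64_internal_mov_immediate returns 2 and the
   constant is costed as COSTS_N_INSNS (2).  */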
10424 if (x == const0_rtx)
10425 *cost = 0;
10426 else
10427 {
10428 /* To an approximation, the cost of building any other constant is
10429 proportional to the number of instructions required to build that
10430 constant. This is true whether we are compiling for SPEED or
10431 otherwise. */
10432 if (!is_a <scalar_int_mode> (mode, &int_mode))
10433 int_mode = word_mode;
10434 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10435 (NULL_RTX, x, false, int_mode));
10436 }
10437 return true;
10438
10439 case CONST_DOUBLE:
10440
10441 /* First determine number of instructions to do the move
10442 as an integer constant. */
10443 if (!aarch64_float_const_representable_p (x)
10444 && !aarch64_can_const_movi_rtx_p (x, mode)
10445 && aarch64_float_const_rtx_p (x))
10446 {
10447 unsigned HOST_WIDE_INT ival;
10448 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10449 gcc_assert (succeed);
10450
10451 scalar_int_mode imode = (mode == HFmode
10452 ? SImode
10453 : int_mode_for_mode (mode).require ());
10454 int ncost = aarch64_internal_mov_immediate
10455 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10456 *cost += COSTS_N_INSNS (ncost);
10457 return true;
10458 }
10459
10460 if (speed)
10461 {
10462 /* mov[df,sf]_aarch64. */
10463 if (aarch64_float_const_representable_p (x))
10464 /* FMOV (scalar immediate). */
10465 *cost += extra_cost->fp[mode == DFmode].fpconst;
10466 else if (!aarch64_float_const_zero_rtx_p (x))
10467 {
10468 /* This will be a load from memory. */
10469 if (mode == DFmode)
10470 *cost += extra_cost->ldst.loadd;
10471 else
10472 *cost += extra_cost->ldst.loadf;
10473 }
10474 else
10475 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10476 or MOV v0.s[0], wzr - neither of which is modeled by the
10477 cost tables. Just use the default cost. */
10478 {
10479 }
10480 }
10481
10482 return true;
10483
10484 case MEM:
10485 if (speed)
10486 {
10487 /* For loads we want the base cost of a load, plus an
10488 approximation for the additional cost of the addressing
10489 mode. */
10490 rtx address = XEXP (x, 0);
10491 if (VECTOR_MODE_P (mode))
10492 *cost += extra_cost->ldst.loadv;
10493 else if (GET_MODE_CLASS (mode) == MODE_INT)
10494 *cost += extra_cost->ldst.load;
10495 else if (mode == SFmode)
10496 *cost += extra_cost->ldst.loadf;
10497 else if (mode == DFmode)
10498 *cost += extra_cost->ldst.loadd;
10499
10500 *cost +=
10501 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10502 0, speed));
10503 }
10504
10505 return true;
10506
10507 case NEG:
10508 op0 = XEXP (x, 0);
10509
10510 if (VECTOR_MODE_P (mode))
10511 {
10512 if (speed)
10513 {
10514 /* FNEG. */
10515 *cost += extra_cost->vect.alu;
10516 }
10517 return false;
10518 }
10519
10520 if (GET_MODE_CLASS (mode) == MODE_INT)
10521 {
10522 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10523 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10524 {
10525 /* CSETM. */
10526 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10527 return true;
10528 }
10529
10530 /* Cost this as SUB wzr, X. */
10531 op0 = CONST0_RTX (mode);
10532 op1 = XEXP (x, 0);
10533 goto cost_minus;
10534 }
10535
10536 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10537 {
10538 /* Support (neg(fma...)) as a single instruction only if
10539 the sign of zeros is unimportant. This matches the decision
10540 made in aarch64.md. */
10541 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10542 {
10543 /* FNMADD. */
10544 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10545 return true;
10546 }
10547 if (GET_CODE (op0) == MULT)
10548 {
10549 /* FNMUL. */
10550 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10551 return true;
10552 }
10553 if (speed)
10554 /* FNEG. */
10555 *cost += extra_cost->fp[mode == DFmode].neg;
10556 return false;
10557 }
10558
10559 return false;
10560
10561 case CLRSB:
10562 case CLZ:
10563 if (speed)
10564 {
10565 if (VECTOR_MODE_P (mode))
10566 *cost += extra_cost->vect.alu;
10567 else
10568 *cost += extra_cost->alu.clz;
10569 }
10570
10571 return false;
10572
10573 case COMPARE:
10574 op0 = XEXP (x, 0);
10575 op1 = XEXP (x, 1);
10576
10577 if (op1 == const0_rtx
10578 && GET_CODE (op0) == AND)
10579 {
10580 x = op0;
10581 mode = GET_MODE (op0);
10582 goto cost_logic;
10583 }
10584
10585 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10586 {
10587 /* TODO: A write to the CC flags possibly costs extra; this
10588 needs encoding in the cost tables. */
10589
10590 mode = GET_MODE (op0);
10591 /* ANDS. */
10592 if (GET_CODE (op0) == AND)
10593 {
10594 x = op0;
10595 goto cost_logic;
10596 }
10597
10598 if (GET_CODE (op0) == PLUS)
10599 {
10600 /* ADDS (and CMN alias). */
10601 x = op0;
10602 goto cost_plus;
10603 }
10604
10605 if (GET_CODE (op0) == MINUS)
10606 {
10607 /* SUBS. */
10608 x = op0;
10609 goto cost_minus;
10610 }
10611
10612 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10613 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10614 && CONST_INT_P (XEXP (op0, 2)))
10615 {
10616 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10617 Handle it here directly rather than going to cost_logic
10618 since we know the immediate generated for the TST is valid
10619 so we can avoid creating an intermediate rtx for it only
10620 for costing purposes. */
10621 if (speed)
10622 *cost += extra_cost->alu.logical;
10623
10624 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10625 ZERO_EXTRACT, 0, speed);
10626 return true;
10627 }
10628
10629 if (GET_CODE (op1) == NEG)
10630 {
10631 /* CMN. */
10632 if (speed)
10633 *cost += extra_cost->alu.arith;
10634
10635 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10636 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10637 return true;
10638 }
10639
10640 /* CMP.
10641
10642 Compare can freely swap the order of operands, and
10643 canonicalization puts the more complex operation first.
10644 But the integer MINUS logic expects the shift/extend
10645 operation in op1. */
10646 if (! (REG_P (op0)
10647 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10648 {
10649 op0 = XEXP (x, 1);
10650 op1 = XEXP (x, 0);
10651 }
10652 goto cost_minus;
10653 }
10654
10655 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10656 {
10657 /* FCMP. */
10658 if (speed)
10659 *cost += extra_cost->fp[mode == DFmode].compare;
10660
10661 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10662 {
10663 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10664 /* FCMP supports constant 0.0 for no extra cost. */
10665 return true;
10666 }
10667 return false;
10668 }
10669
10670 if (VECTOR_MODE_P (mode))
10671 {
10672 /* Vector compare. */
10673 if (speed)
10674 *cost += extra_cost->vect.alu;
10675
10676 if (aarch64_float_const_zero_rtx_p (op1))
10677 {
10678 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10679 cost. */
10680 return true;
10681 }
10682 return false;
10683 }
10684 return false;
10685
10686 case MINUS:
10687 {
10688 op0 = XEXP (x, 0);
10689 op1 = XEXP (x, 1);
10690
10691 cost_minus:
10692 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10693
10694 /* Detect valid immediates. */
10695 if ((GET_MODE_CLASS (mode) == MODE_INT
10696 || (GET_MODE_CLASS (mode) == MODE_CC
10697 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10698 && CONST_INT_P (op1)
10699 && aarch64_uimm12_shift (INTVAL (op1)))
10700 {
10701 if (speed)
10702 /* SUB(S) (immediate). */
10703 *cost += extra_cost->alu.arith;
10704 return true;
10705 }
10706
10707 /* Look for SUB (extended register). */
10708 if (is_a <scalar_int_mode> (mode, &int_mode)
10709 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10710 {
10711 if (speed)
10712 *cost += extra_cost->alu.extend_arith;
10713
10714 op1 = aarch64_strip_extend (op1, true);
10715 *cost += rtx_cost (op1, VOIDmode,
10716 (enum rtx_code) GET_CODE (op1), 0, speed);
10717 return true;
10718 }
10719
10720 rtx new_op1 = aarch64_strip_extend (op1, false);
10721
10722 /* Cost this as an FMA-alike operation. */
10723 if ((GET_CODE (new_op1) == MULT
10724 || aarch64_shift_p (GET_CODE (new_op1)))
10725 && code != COMPARE)
10726 {
10727 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10728 (enum rtx_code) code,
10729 speed);
10730 return true;
10731 }
10732
10733 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10734
10735 if (speed)
10736 {
10737 if (VECTOR_MODE_P (mode))
10738 {
10739 /* Vector SUB. */
10740 *cost += extra_cost->vect.alu;
10741 }
10742 else if (GET_MODE_CLASS (mode) == MODE_INT)
10743 {
10744 /* SUB(S). */
10745 *cost += extra_cost->alu.arith;
10746 }
10747 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10748 {
10749 /* FSUB. */
10750 *cost += extra_cost->fp[mode == DFmode].addsub;
10751 }
10752 }
10753 return true;
10754 }
10755
10756 case PLUS:
10757 {
10758 rtx new_op0;
10759
10760 op0 = XEXP (x, 0);
10761 op1 = XEXP (x, 1);
10762
10763 cost_plus:
10764 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10765 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10766 {
10767 /* CSINC. */
10768 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10769 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10770 return true;
10771 }
10772
10773 if (GET_MODE_CLASS (mode) == MODE_INT
10774 && (aarch64_plus_immediate (op1, mode)
10775 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10776 {
10777 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10778
10779 if (speed)
10780 /* ADD (immediate). */
10781 *cost += extra_cost->alu.arith;
10782 return true;
10783 }
10784
10785 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10786
10787 /* Look for ADD (extended register). */
10788 if (is_a <scalar_int_mode> (mode, &int_mode)
10789 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10790 {
10791 if (speed)
10792 *cost += extra_cost->alu.extend_arith;
10793
10794 op0 = aarch64_strip_extend (op0, true);
10795 *cost += rtx_cost (op0, VOIDmode,
10796 (enum rtx_code) GET_CODE (op0), 0, speed);
10797 return true;
10798 }
10799
10800 /* Strip any extend, leave shifts behind as we will
10801 cost them through mult_cost. */
10802 new_op0 = aarch64_strip_extend (op0, false);
10803
10804 if (GET_CODE (new_op0) == MULT
10805 || aarch64_shift_p (GET_CODE (new_op0)))
10806 {
10807 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10808 speed);
10809 return true;
10810 }
10811
10812 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10813
10814 if (speed)
10815 {
10816 if (VECTOR_MODE_P (mode))
10817 {
10818 /* Vector ADD. */
10819 *cost += extra_cost->vect.alu;
10820 }
10821 else if (GET_MODE_CLASS (mode) == MODE_INT)
10822 {
10823 /* ADD. */
10824 *cost += extra_cost->alu.arith;
10825 }
10826 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10827 {
10828 /* FADD. */
10829 *cost += extra_cost->fp[mode == DFmode].addsub;
10830 }
10831 }
10832 return true;
10833 }
10834
10835 case BSWAP:
10836 *cost = COSTS_N_INSNS (1);
10837
10838 if (speed)
10839 {
10840 if (VECTOR_MODE_P (mode))
10841 *cost += extra_cost->vect.alu;
10842 else
10843 *cost += extra_cost->alu.rev;
10844 }
10845 return false;
10846
10847 case IOR:
10848 if (aarch_rev16_p (x))
10849 {
10850 *cost = COSTS_N_INSNS (1);
10851
10852 if (speed)
10853 {
10854 if (VECTOR_MODE_P (mode))
10855 *cost += extra_cost->vect.alu;
10856 else
10857 *cost += extra_cost->alu.rev;
10858 }
10859 return true;
10860 }
10861
10862 if (aarch64_extr_rtx_p (x, &op0, &op1))
10863 {
10864 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10865 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10866 if (speed)
10867 *cost += extra_cost->alu.shift;
10868
10869 return true;
10870 }
10871 /* Fall through. */
10872 case XOR:
10873 case AND:
10874 cost_logic:
10875 op0 = XEXP (x, 0);
10876 op1 = XEXP (x, 1);
10877
10878 if (VECTOR_MODE_P (mode))
10879 {
10880 if (speed)
10881 *cost += extra_cost->vect.alu;
10882 return true;
10883 }
10884
10885 if (code == AND
10886 && GET_CODE (op0) == MULT
10887 && CONST_INT_P (XEXP (op0, 1))
10888 && CONST_INT_P (op1)
10889 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10890 INTVAL (op1)) != 0)
10891 {
10892 /* This is a UBFM/SBFM. */
10893 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10894 if (speed)
10895 *cost += extra_cost->alu.bfx;
10896 return true;
10897 }
10898
10899 if (is_int_mode (mode, &int_mode))
10900 {
10901 if (CONST_INT_P (op1))
10902 {
10903 /* We have a mask + shift version of a UBFIZ
10904 i.e. the *andim_ashift<mode>_bfiz pattern. */
10905 if (GET_CODE (op0) == ASHIFT
10906 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10907 XEXP (op0, 1)))
10908 {
10909 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10910 (enum rtx_code) code, 0, speed);
10911 if (speed)
10912 *cost += extra_cost->alu.bfx;
10913
10914 return true;
10915 }
10916 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10917 {
10918 /* We possibly get the immediate for free; this is not
10919 modelled. */
10920 *cost += rtx_cost (op0, int_mode,
10921 (enum rtx_code) code, 0, speed);
10922 if (speed)
10923 *cost += extra_cost->alu.logical;
10924
10925 return true;
10926 }
10927 }
10928 else
10929 {
10930 rtx new_op0 = op0;
10931
10932 /* Handle ORN, EON, or BIC. */
10933 if (GET_CODE (op0) == NOT)
10934 op0 = XEXP (op0, 0);
10935
10936 new_op0 = aarch64_strip_shift (op0);
10937
10938 /* If we had a shift on op0 then this is a logical-shift-
10939 by-register/immediate operation. Otherwise, this is just
10940 a logical operation. */
10941 if (speed)
10942 {
10943 if (new_op0 != op0)
10944 {
10945 /* Shift by immediate. */
10946 if (CONST_INT_P (XEXP (op0, 1)))
10947 *cost += extra_cost->alu.log_shift;
10948 else
10949 *cost += extra_cost->alu.log_shift_reg;
10950 }
10951 else
10952 *cost += extra_cost->alu.logical;
10953 }
10954
10955 /* In both cases we want to cost both operands. */
10956 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10957 0, speed);
10958 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10959 1, speed);
10960
10961 return true;
10962 }
10963 }
10964 return false;
10965
10966 case NOT:
10967 x = XEXP (x, 0);
10968 op0 = aarch64_strip_shift (x);
10969
10970 if (VECTOR_MODE_P (mode))
10971 {
10972 /* Vector NOT. */
10973 *cost += extra_cost->vect.alu;
10974 return false;
10975 }
10976
10977 /* MVN-shifted-reg. */
10978 if (op0 != x)
10979 {
10980 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10981
10982 if (speed)
10983 *cost += extra_cost->alu.log_shift;
10984
10985 return true;
10986 }
10987 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10988 Handle the second form here taking care that 'a' in the above can
10989 be a shift. */
10990 else if (GET_CODE (op0) == XOR)
10991 {
10992 rtx newop0 = XEXP (op0, 0);
10993 rtx newop1 = XEXP (op0, 1);
10994 rtx op0_stripped = aarch64_strip_shift (newop0);
10995
10996 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10997 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10998
10999 if (speed)
11000 {
11001 if (op0_stripped != newop0)
11002 *cost += extra_cost->alu.log_shift;
11003 else
11004 *cost += extra_cost->alu.logical;
11005 }
11006
11007 return true;
11008 }
11009 /* MVN. */
11010 if (speed)
11011 *cost += extra_cost->alu.logical;
11012
11013 return false;
11014
11015 case ZERO_EXTEND:
11016
11017 op0 = XEXP (x, 0);
11018 /* If a value is written in SI mode, then zero extended to DI
11019 mode, the operation will in general be free as a write to
11020 a 'w' register implicitly zeroes the upper bits of an 'x'
11021 register. However, if this is
11022
11023 (set (reg) (zero_extend (reg)))
11024
11025 we must cost the explicit register move. */
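/* Illustrative note (not part of the original source): for instance,
"add w0, w1, w2" already zeroes bits 63:32 of x0, so a pattern such as
(zero_extend:DI (plus:SI ...)) costs no more than the inner PLUS,
whereas a bare (set (reg:DI) (zero_extend:DI (reg:SI))) must still pay
for the explicit move (typically a MOV to the 'w' register).  */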
11026 if (mode == DImode
11027 && GET_MODE (op0) == SImode
11028 && outer == SET)
11029 {
11030 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11031
11032 /* If OP_COST is non-zero, then the cost of the zero extend
11033 is effectively the cost of the inner operation. Otherwise
11034 we have a MOV instruction and we take the cost from the MOV
11035 itself. This is true independently of whether we are
11036 optimizing for space or time. */
11037 if (op_cost)
11038 *cost = op_cost;
11039
11040 return true;
11041 }
11042 else if (MEM_P (op0))
11043 {
11044 /* All loads can zero extend to any size for free. */
11045 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11046 return true;
11047 }
11048
11049 op0 = aarch64_extend_bitfield_pattern_p (x);
11050 if (op0)
11051 {
11052 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11053 if (speed)
11054 *cost += extra_cost->alu.bfx;
11055 return true;
11056 }
11057
11058 if (speed)
11059 {
11060 if (VECTOR_MODE_P (mode))
11061 {
11062 /* UMOV. */
11063 *cost += extra_cost->vect.alu;
11064 }
11065 else
11066 {
11067 /* We generate an AND instead of UXTB/UXTH. */
11068 *cost += extra_cost->alu.logical;
11069 }
11070 }
11071 return false;
11072
11073 case SIGN_EXTEND:
11074 if (MEM_P (XEXP (x, 0)))
11075 {
11076 /* LDRSH. */
11077 if (speed)
11078 {
11079 rtx address = XEXP (XEXP (x, 0), 0);
11080 *cost += extra_cost->ldst.load_sign_extend;
11081
11082 *cost +=
11083 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11084 0, speed));
11085 }
11086 return true;
11087 }
11088
11089 op0 = aarch64_extend_bitfield_pattern_p (x);
11090 if (op0)
11091 {
11092 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11093 if (speed)
11094 *cost += extra_cost->alu.bfx;
11095 return true;
11096 }
11097
11098 if (speed)
11099 {
11100 if (VECTOR_MODE_P (mode))
11101 *cost += extra_cost->vect.alu;
11102 else
11103 *cost += extra_cost->alu.extend;
11104 }
11105 return false;
11106
11107 case ASHIFT:
11108 op0 = XEXP (x, 0);
11109 op1 = XEXP (x, 1);
11110
11111 if (CONST_INT_P (op1))
11112 {
11113 if (speed)
11114 {
11115 if (VECTOR_MODE_P (mode))
11116 {
11117 /* Vector shift (immediate). */
11118 *cost += extra_cost->vect.alu;
11119 }
11120 else
11121 {
11122 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11123 aliases. */
11124 *cost += extra_cost->alu.shift;
11125 }
11126 }
11127
11128 /* We can incorporate zero/sign extend for free. */
11129 if (GET_CODE (op0) == ZERO_EXTEND
11130 || GET_CODE (op0) == SIGN_EXTEND)
11131 op0 = XEXP (op0, 0);
11132
11133 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11134 return true;
11135 }
11136 else
11137 {
11138 if (VECTOR_MODE_P (mode))
11139 {
11140 if (speed)
11141 /* Vector shift (register). */
11142 *cost += extra_cost->vect.alu;
11143 }
11144 else
11145 {
11146 if (speed)
11147 /* LSLV. */
11148 *cost += extra_cost->alu.shift_reg;
11149
11150 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11151 && CONST_INT_P (XEXP (op1, 1))
11152 && known_eq (INTVAL (XEXP (op1, 1)),
11153 GET_MODE_BITSIZE (mode) - 1))
11154 {
11155 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11156 /* We already demanded XEXP (op1, 0) to be REG_P, so
11157 don't recurse into it. */
11158 return true;
11159 }
11160 }
11161 return false; /* All arguments need to be in registers. */
11162 }
11163
11164 case ROTATE:
11165 case ROTATERT:
11166 case LSHIFTRT:
11167 case ASHIFTRT:
11168 op0 = XEXP (x, 0);
11169 op1 = XEXP (x, 1);
11170
11171 if (CONST_INT_P (op1))
11172 {
11173 /* ASR (immediate) and friends. */
11174 if (speed)
11175 {
11176 if (VECTOR_MODE_P (mode))
11177 *cost += extra_cost->vect.alu;
11178 else
11179 *cost += extra_cost->alu.shift;
11180 }
11181
11182 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11183 return true;
11184 }
11185 else
11186 {
11187 if (VECTOR_MODE_P (mode))
11188 {
11189 if (speed)
11190 /* Vector shift (register). */
11191 *cost += extra_cost->vect.alu;
11192 }
11193 else
11194 {
11195 if (speed)
11196 /* ASR (register) and friends. */
11197 *cost += extra_cost->alu.shift_reg;
11198
11199 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11200 && CONST_INT_P (XEXP (op1, 1))
11201 && known_eq (INTVAL (XEXP (op1, 1)),
11202 GET_MODE_BITSIZE (mode) - 1))
11203 {
11204 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11205 /* We already demanded XEXP (op1, 0) to be REG_P, so
11206 don't recurse into it. */
11207 return true;
11208 }
11209 }
11210 return false; /* All arguments need to be in registers. */
11211 }
11212
11213 case SYMBOL_REF:
11214
11215 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11216 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11217 {
11218 /* LDR. */
11219 if (speed)
11220 *cost += extra_cost->ldst.load;
11221 }
11222 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11223 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11224 {
11225 /* ADRP, followed by ADD. */
11226 *cost += COSTS_N_INSNS (1);
11227 if (speed)
11228 *cost += 2 * extra_cost->alu.arith;
11229 }
11230 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11231 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11232 {
11233 /* ADR. */
11234 if (speed)
11235 *cost += extra_cost->alu.arith;
11236 }
11237
11238 if (flag_pic)
11239 {
11240 /* One extra load instruction, after accessing the GOT. */
11241 *cost += COSTS_N_INSNS (1);
11242 if (speed)
11243 *cost += extra_cost->ldst.load;
11244 }
11245 return true;
11246
11247 case HIGH:
11248 case LO_SUM:
11249 /* ADRP/ADD (immediate). */
11250 if (speed)
11251 *cost += extra_cost->alu.arith;
11252 return true;
11253
11254 case ZERO_EXTRACT:
11255 case SIGN_EXTRACT:
11256 /* UBFX/SBFX. */
11257 if (speed)
11258 {
11259 if (VECTOR_MODE_P (mode))
11260 *cost += extra_cost->vect.alu;
11261 else
11262 *cost += extra_cost->alu.bfx;
11263 }
11264
11265 /* We can trust that the immediates used will be correct (there
11266 are no by-register forms), so we need only cost op0. */
11267 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11268 return true;
11269
11270 case MULT:
11271 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11272 /* aarch64_rtx_mult_cost always handles recursion to its
11273 operands. */
11274 return true;
11275
11276 case MOD:
11277 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11278 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
11279 an unconditional negate. This case should only ever be reached through
11280 the set_smod_pow2_cheap check in expmed.c. */
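/* Rough sketch of the expansion being costed (not from the original
source); for "w0 % 4" the generated sequence is approximately:

     negs  w1, w0
     and   w0, w0, #3
     and   w1, w1, #3
     csneg w0, w0, w1, mi

i.e. four instructions, matching the COSTS_N_INSNS (4) below.  */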
11281 if (CONST_INT_P (XEXP (x, 1))
11282 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11283 && (mode == SImode || mode == DImode))
11284 {
11285 /* We expand to 4 instructions. Reset the baseline. */
11286 *cost = COSTS_N_INSNS (4);
11287
11288 if (speed)
11289 *cost += 2 * extra_cost->alu.logical
11290 + 2 * extra_cost->alu.arith;
11291
11292 return true;
11293 }
11294
11295 /* Fall-through. */
11296 case UMOD:
11297 if (speed)
11298 {
11299 /* Slightly prefer UMOD over SMOD. */
11300 if (VECTOR_MODE_P (mode))
11301 *cost += extra_cost->vect.alu;
11302 else if (GET_MODE_CLASS (mode) == MODE_INT)
11303 *cost += (extra_cost->mult[mode == DImode].add
11304 + extra_cost->mult[mode == DImode].idiv
11305 + (code == MOD ? 1 : 0));
11306 }
11307 return false; /* All arguments need to be in registers. */
11308
11309 case DIV:
11310 case UDIV:
11311 case SQRT:
11312 if (speed)
11313 {
11314 if (VECTOR_MODE_P (mode))
11315 *cost += extra_cost->vect.alu;
11316 else if (GET_MODE_CLASS (mode) == MODE_INT)
11317 /* There is no integer SQRT, so only DIV and UDIV can get
11318 here. */
11319 *cost += (extra_cost->mult[mode == DImode].idiv
11320 /* Slightly prefer UDIV over SDIV. */
11321 + (code == DIV ? 1 : 0));
11322 else
11323 *cost += extra_cost->fp[mode == DFmode].div;
11324 }
11325 return false; /* All arguments need to be in registers. */
11326
11327 case IF_THEN_ELSE:
11328 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11329 XEXP (x, 2), cost, speed);
11330
11331 case EQ:
11332 case NE:
11333 case GT:
11334 case GTU:
11335 case LT:
11336 case LTU:
11337 case GE:
11338 case GEU:
11339 case LE:
11340 case LEU:
11341
11342 return false; /* All arguments must be in registers. */
11343
11344 case FMA:
11345 op0 = XEXP (x, 0);
11346 op1 = XEXP (x, 1);
11347 op2 = XEXP (x, 2);
11348
11349 if (speed)
11350 {
11351 if (VECTOR_MODE_P (mode))
11352 *cost += extra_cost->vect.alu;
11353 else
11354 *cost += extra_cost->fp[mode == DFmode].fma;
11355 }
11356
11357 /* FMSUB, FNMADD, and FNMSUB are free. */
11358 if (GET_CODE (op0) == NEG)
11359 op0 = XEXP (op0, 0);
11360
11361 if (GET_CODE (op2) == NEG)
11362 op2 = XEXP (op2, 0);
11363
11364 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11365 and the by-element operand as operand 0. */
11366 if (GET_CODE (op1) == NEG)
11367 op1 = XEXP (op1, 0);
11368
11369 /* Catch vector-by-element operations. The by-element operand can
11370 either be (vec_duplicate (vec_select (x))) or just
11371 (vec_select (x)), depending on whether we are multiplying by
11372 a vector or a scalar.
11373
11374 Canonicalization is not very good in these cases, FMA4 will put the
11375 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11376 if (GET_CODE (op0) == VEC_DUPLICATE)
11377 op0 = XEXP (op0, 0);
11378 else if (GET_CODE (op1) == VEC_DUPLICATE)
11379 op1 = XEXP (op1, 0);
11380
11381 if (GET_CODE (op0) == VEC_SELECT)
11382 op0 = XEXP (op0, 0);
11383 else if (GET_CODE (op1) == VEC_SELECT)
11384 op1 = XEXP (op1, 0);
11385
11386 /* If the remaining parameters are not registers,
11387 get the cost to put them into registers. */
11388 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11389 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11390 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11391 return true;
11392
11393 case FLOAT:
11394 case UNSIGNED_FLOAT:
11395 if (speed)
11396 *cost += extra_cost->fp[mode == DFmode].fromint;
11397 return false;
11398
11399 case FLOAT_EXTEND:
11400 if (speed)
11401 {
11402 if (VECTOR_MODE_P (mode))
11403 {
11404 /* Vector widen. */
11405 *cost += extra_cost->vect.alu;
11406 }
11407 else
11408 *cost += extra_cost->fp[mode == DFmode].widen;
11409 }
11410 return false;
11411
11412 case FLOAT_TRUNCATE:
11413 if (speed)
11414 {
11415 if (VECTOR_MODE_P (mode))
11416 {
11417 /* Vector conversion. */
11418 *cost += extra_cost->vect.alu;
11419 }
11420 else
11421 *cost += extra_cost->fp[mode == DFmode].narrow;
11422 }
11423 return false;
11424
11425 case FIX:
11426 case UNSIGNED_FIX:
11427 x = XEXP (x, 0);
11428 /* Strip the rounding part. They will all be implemented
11429 by the fcvt* family of instructions anyway. */
11430 if (GET_CODE (x) == UNSPEC)
11431 {
11432 unsigned int uns_code = XINT (x, 1);
11433
11434 if (uns_code == UNSPEC_FRINTA
11435 || uns_code == UNSPEC_FRINTM
11436 || uns_code == UNSPEC_FRINTN
11437 || uns_code == UNSPEC_FRINTP
11438 || uns_code == UNSPEC_FRINTZ)
11439 x = XVECEXP (x, 0, 0);
11440 }
11441
11442 if (speed)
11443 {
11444 if (VECTOR_MODE_P (mode))
11445 *cost += extra_cost->vect.alu;
11446 else
11447 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11448 }
11449
11450 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11451 fixed-point fcvt. */
11452 if (GET_CODE (x) == MULT
11453 && ((VECTOR_MODE_P (mode)
11454 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11455 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11456 {
11457 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11458 0, speed);
11459 return true;
11460 }
11461
11462 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11463 return true;
11464
11465 case ABS:
11466 if (VECTOR_MODE_P (mode))
11467 {
11468 /* ABS (vector). */
11469 if (speed)
11470 *cost += extra_cost->vect.alu;
11471 }
11472 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11473 {
11474 op0 = XEXP (x, 0);
11475
11476 /* FABD, which is analogous to FADD. */
11477 if (GET_CODE (op0) == MINUS)
11478 {
11479 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11480 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11481 if (speed)
11482 *cost += extra_cost->fp[mode == DFmode].addsub;
11483
11484 return true;
11485 }
11486 /* Simple FABS is analogous to FNEG. */
11487 if (speed)
11488 *cost += extra_cost->fp[mode == DFmode].neg;
11489 }
11490 else
11491 {
11492 /* Integer ABS will either be split to
11493 two arithmetic instructions, or will be an ABS
11494 (scalar), which we don't model. */
11495 *cost = COSTS_N_INSNS (2);
11496 if (speed)
11497 *cost += 2 * extra_cost->alu.arith;
11498 }
11499 return false;
11500
11501 case SMAX:
11502 case SMIN:
11503 if (speed)
11504 {
11505 if (VECTOR_MODE_P (mode))
11506 *cost += extra_cost->vect.alu;
11507 else
11508 {
11509 /* FMAXNM/FMINNM/FMAX/FMIN.
11510 TODO: This may not be accurate for all implementations, but
11511 we do not model this in the cost tables. */
11512 *cost += extra_cost->fp[mode == DFmode].addsub;
11513 }
11514 }
11515 return false;
11516
11517 case UNSPEC:
11518 /* The floating point round to integer frint* instructions. */
11519 if (aarch64_frint_unspec_p (XINT (x, 1)))
11520 {
11521 if (speed)
11522 *cost += extra_cost->fp[mode == DFmode].roundint;
11523
11524 return false;
11525 }
11526
11527 if (XINT (x, 1) == UNSPEC_RBIT)
11528 {
11529 if (speed)
11530 *cost += extra_cost->alu.rev;
11531
11532 return false;
11533 }
11534 break;
11535
11536 case TRUNCATE:
11537
11538 /* Decompose <su>muldi3_highpart. */
11539 if (/* (truncate:DI */
11540 mode == DImode
11541 /* (lshiftrt:TI */
11542 && GET_MODE (XEXP (x, 0)) == TImode
11543 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11544 /* (mult:TI */
11545 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11546 /* (ANY_EXTEND:TI (reg:DI))
11547 (ANY_EXTEND:TI (reg:DI))) */
11548 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11549 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11550 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11551 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11552 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11553 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11554 /* (const_int 64) */
11555 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11556 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11557 {
11558 /* UMULH/SMULH. */
11559 if (speed)
11560 *cost += extra_cost->mult[mode == DImode].extend;
11561 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11562 mode, MULT, 0, speed);
11563 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11564 mode, MULT, 1, speed);
11565 return true;
11566 }
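/* Illustrative note (not in the original source): the shape matched
above, written out in one piece, is

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (sign_extend:TI (reg:DI)) (sign_extend:TI (reg:DI)))
         (const_int 64)))

(or the ZERO_EXTEND equivalent), which corresponds to a single SMULH
(or UMULH) instruction.  */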
11567
11568 /* Fall through. */
11569 default:
11570 break;
11571 }
11572
11573 if (dump_file
11574 && flag_aarch64_verbose_cost)
11575 fprintf (dump_file,
11576 "\nFailed to cost RTX. Assuming default cost.\n");
11577
11578 return true;
11579 }
11580
11581 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11582 calculated for X. This cost is stored in *COST. Returns true
11583 if the total cost of X was calculated. */
11584 static bool
11585 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11586 int param, int *cost, bool speed)
11587 {
11588 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11589
11590 if (dump_file
11591 && flag_aarch64_verbose_cost)
11592 {
11593 print_rtl_single (dump_file, x);
11594 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11595 speed ? "Hot" : "Cold",
11596 *cost, result ? "final" : "partial");
11597 }
11598
11599 return result;
11600 }
11601
11602 static int
11603 aarch64_register_move_cost (machine_mode mode,
11604 reg_class_t from_i, reg_class_t to_i)
11605 {
11606 enum reg_class from = (enum reg_class) from_i;
11607 enum reg_class to = (enum reg_class) to_i;
11608 const struct cpu_regmove_cost *regmove_cost
11609 = aarch64_tune_params.regmove_cost;
11610
11611 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11612 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11613 to = GENERAL_REGS;
11614
11615 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11616 from = GENERAL_REGS;
11617
11618 /* Moving between GPR and stack cost is the same as GP2GP. */
11619 if ((from == GENERAL_REGS && to == STACK_REG)
11620 || (to == GENERAL_REGS && from == STACK_REG))
11621 return regmove_cost->GP2GP;
11622
11623 /* To/From the stack register, we move via the gprs. */
11624 if (to == STACK_REG || from == STACK_REG)
11625 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11626 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11627
11628 if (known_eq (GET_MODE_SIZE (mode), 16))
11629 {
11630 /* 128-bit operations on general registers require 2 instructions. */
11631 if (from == GENERAL_REGS && to == GENERAL_REGS)
11632 return regmove_cost->GP2GP * 2;
11633 else if (from == GENERAL_REGS)
11634 return regmove_cost->GP2FP * 2;
11635 else if (to == GENERAL_REGS)
11636 return regmove_cost->FP2GP * 2;
11637
11638 /* When AdvSIMD instructions are disabled it is not possible to move
11639 a 128-bit value directly between Q registers. This is handled in
11640 secondary reload. A general register is used as a scratch to move
11641 the upper DI value and the lower DI value is moved directly,
11642 hence the cost is the sum of three moves. */
11643 if (! TARGET_SIMD)
11644 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11645
11646 return regmove_cost->FP2FP;
11647 }
11648
11649 if (from == GENERAL_REGS && to == GENERAL_REGS)
11650 return regmove_cost->GP2GP;
11651 else if (from == GENERAL_REGS)
11652 return regmove_cost->GP2FP;
11653 else if (to == GENERAL_REGS)
11654 return regmove_cost->FP2GP;
11655
11656 return regmove_cost->FP2FP;
11657 }
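/* Illustrative note (not part of the original source): with the logic
above, a 128-bit (e.g. TImode) copy between a general register pair and
an FP/SIMD register is costed as 2 * GP2FP (or 2 * FP2GP), and when
!TARGET_SIMD a Q-register copy is costed as GP2FP + FP2GP + FP2FP
because the upper half has to bounce through a general register.  */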
11658
11659 static int
11660 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11661 reg_class_t rclass ATTRIBUTE_UNUSED,
11662 bool in ATTRIBUTE_UNUSED)
11663 {
11664 return aarch64_tune_params.memmov_cost;
11665 }
11666
11667 /* Implement TARGET_INIT_BUILTINS. */
11668 static void
11669 aarch64_init_builtins ()
11670 {
11671 aarch64_general_init_builtins ();
11672 }
11673
11674 /* Implement TARGET_FOLD_BUILTIN. */
11675 static tree
11676 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
11677 {
11678 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11679 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11680 tree type = TREE_TYPE (TREE_TYPE (fndecl));
11681 switch (code & AARCH64_BUILTIN_CLASS)
11682 {
11683 case AARCH64_BUILTIN_GENERAL:
11684 return aarch64_general_fold_builtin (subcode, type, nargs, args);
11685 }
11686 gcc_unreachable ();
11687 }
11688
11689 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
11690 static bool
11691 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
11692 {
11693 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
11694 tree fndecl = gimple_call_fndecl (stmt);
11695 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11696 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11697 gimple *new_stmt = NULL;
11698 switch (code & AARCH64_BUILTIN_CLASS)
11699 {
11700 case AARCH64_BUILTIN_GENERAL:
11701 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
11702 break;
11703 }
11704
11705 if (!new_stmt)
11706 return false;
11707
11708 gsi_replace (gsi, new_stmt, true);
11709 return true;
11710 }
11711
11712 /* Implement TARGET_EXPAND_BUILTIN. */
11713 static rtx
11714 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int)
11715 {
11716 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11717 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11718 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11719 switch (code & AARCH64_BUILTIN_CLASS)
11720 {
11721 case AARCH64_BUILTIN_GENERAL:
11722 return aarch64_general_expand_builtin (subcode, exp, target);
11723 }
11724 gcc_unreachable ();
11725 }
11726
11727 /* Implement TARGET_BUILTIN_DECL. */
11728 static tree
11729 aarch64_builtin_decl (unsigned int code, bool initialize_p)
11730 {
11731 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11732 switch (code & AARCH64_BUILTIN_CLASS)
11733 {
11734 case AARCH64_BUILTIN_GENERAL:
11735 return aarch64_general_builtin_decl (subcode, initialize_p);
11736 }
11737 gcc_unreachable ();
11738 }
11739
11740 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11741 to optimize 1.0/sqrt. */
11742
11743 static bool
11744 use_rsqrt_p (machine_mode mode)
11745 {
11746 return (!flag_trapping_math
11747 && flag_unsafe_math_optimizations
11748 && ((aarch64_tune_params.approx_modes->recip_sqrt
11749 & AARCH64_APPROX_MODE (mode))
11750 || flag_mrecip_low_precision_sqrt));
11751 }
11752
11753 /* Function to decide when to use the approximate reciprocal square root
11754 builtin. */
11755
11756 static tree
11757 aarch64_builtin_reciprocal (tree fndecl)
11758 {
11759 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11760
11761 if (!use_rsqrt_p (mode))
11762 return NULL_TREE;
11763 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11764 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11765 switch (code & AARCH64_BUILTIN_CLASS)
11766 {
11767 case AARCH64_BUILTIN_GENERAL:
11768 return aarch64_general_builtin_rsqrt (subcode);
11769 }
11770 gcc_unreachable ();
11771 }
11772
11773 /* Emit instruction sequence to compute either the approximate square root
11774 or its approximate reciprocal, depending on the flag RECP, and return
11775 whether the sequence was emitted or not. */
11776
11777 bool
11778 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11779 {
11780 machine_mode mode = GET_MODE (dst);
11781
11782 if (GET_MODE_INNER (mode) == HFmode)
11783 {
11784 gcc_assert (!recp);
11785 return false;
11786 }
11787
11788 if (!recp)
11789 {
11790 if (!(flag_mlow_precision_sqrt
11791 || (aarch64_tune_params.approx_modes->sqrt
11792 & AARCH64_APPROX_MODE (mode))))
11793 return false;
11794
11795 if (flag_finite_math_only
11796 || flag_trapping_math
11797 || !flag_unsafe_math_optimizations
11798 || optimize_function_for_size_p (cfun))
11799 return false;
11800 }
11801 else
11802 /* Caller assumes we cannot fail. */
11803 gcc_assert (use_rsqrt_p (mode));
11804
11805 machine_mode mmsk = mode_for_int_vector (mode).require ();
11806 rtx xmsk = gen_reg_rtx (mmsk);
11807 if (!recp)
11808 /* When calculating the approximate square root, compare the
11809 argument with 0.0 and create a mask. */
11810 emit_insn (gen_rtx_SET (xmsk,
11811 gen_rtx_NEG (mmsk,
11812 gen_rtx_EQ (mmsk, src,
11813 CONST0_RTX (mode)))));
11814
11815 /* Estimate the approximate reciprocal square root. */
11816 rtx xdst = gen_reg_rtx (mode);
11817 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11818
11819 /* Iterate over the series twice for SF and thrice for DF. */
11820 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11821
11822 /* Optionally iterate over the series once less for faster performance,
11823 while sacrificing some accuracy. */
11824 if ((recp && flag_mrecip_low_precision_sqrt)
11825 || (!recp && flag_mlow_precision_sqrt))
11826 iterations--;
11827
11828 /* Iterate over the series to calculate the approximate reciprocal square
11829 root. */
11830 rtx x1 = gen_reg_rtx (mode);
11831 while (iterations--)
11832 {
11833 rtx x2 = gen_reg_rtx (mode);
11834 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11835
11836 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11837
11838 if (iterations > 0)
11839 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11840 }
11841
11842 if (!recp)
11843 {
11844 /* Qualify the approximate reciprocal square root when the argument is
11845 0.0 by squashing the intermediate result to 0.0. */
11846 rtx xtmp = gen_reg_rtx (mmsk);
11847 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11848 gen_rtx_SUBREG (mmsk, xdst, 0)));
11849 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11850
11851 /* Calculate the approximate square root. */
11852 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11853 }
11854
11855 /* Finalize the approximation. */
11856 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11857
11858 return true;
11859 }
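/* Illustrative note (not in the original source): the refinement loop
above is Newton-Raphson on 1/sqrt(src).  Each FRSQRTS step computes
(3 - src * x^2) / 2, giving the update

     x_{n+1} = x_n * (3 - src * x_n^2) / 2,

which roughly doubles the number of accurate bits per iteration; hence
two iterations for SFmode and three for DFmode.  */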
11860
11861 /* Emit the instruction sequence to compute the approximation for the division
11862 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11863
11864 bool
11865 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11866 {
11867 machine_mode mode = GET_MODE (quo);
11868
11869 if (GET_MODE_INNER (mode) == HFmode)
11870 return false;
11871
11872 bool use_approx_division_p = (flag_mlow_precision_div
11873 || (aarch64_tune_params.approx_modes->division
11874 & AARCH64_APPROX_MODE (mode)));
11875
11876 if (!flag_finite_math_only
11877 || flag_trapping_math
11878 || !flag_unsafe_math_optimizations
11879 || optimize_function_for_size_p (cfun)
11880 || !use_approx_division_p)
11881 return false;
11882
11883 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11884 return false;
11885
11886 /* Estimate the approximate reciprocal. */
11887 rtx xrcp = gen_reg_rtx (mode);
11888 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11889
11890 /* Iterate over the series twice for SF and thrice for DF. */
11891 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11892
11893 /* Optionally iterate over the series once less for faster performance,
11894 while sacrificing some accuracy. */
11895 if (flag_mlow_precision_div)
11896 iterations--;
11897
11898 /* Iterate over the series to calculate the approximate reciprocal. */
11899 rtx xtmp = gen_reg_rtx (mode);
11900 while (iterations--)
11901 {
11902 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11903
11904 if (iterations > 0)
11905 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11906 }
11907
11908 if (num != CONST1_RTX (mode))
11909 {
11910 /* As the approximate reciprocal of DEN is already calculated, only
11911 calculate the approximate division when NUM is not 1.0. */
11912 rtx xnum = force_reg (mode, num);
11913 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11914 }
11915
11916 /* Finalize the approximation. */
11917 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11918 return true;
11919 }
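/* Illustrative note (not in the original source): the loop above is
the analogous Newton-Raphson refinement of 1/den.  Each FRECPS step
computes (2 - den * x), so the update is

     x_{n+1} = x_n * (2 - den * x_n),

and the quotient is then formed as num * (1/den).  */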
11920
11921 /* Return the number of instructions that can be issued per cycle. */
11922 static int
11923 aarch64_sched_issue_rate (void)
11924 {
11925 return aarch64_tune_params.issue_rate;
11926 }
11927
11928 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11929 static int
11930 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
11931 {
11932 if (DEBUG_INSN_P (insn))
11933 return more;
11934
11935 rtx_code code = GET_CODE (PATTERN (insn));
11936 if (code == USE || code == CLOBBER)
11937 return more;
11938
11939 if (get_attr_type (insn) == TYPE_NO_INSN)
11940 return more;
11941
11942 return more - 1;
11943 }
11944
11945 static int
11946 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11947 {
11948 int issue_rate = aarch64_sched_issue_rate ();
11949
11950 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11951 }
11952
11953
11954 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11955 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11956 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11957
11958 static int
11959 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11960 int ready_index)
11961 {
11962 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11963 }
11964
11965
11966 /* Vectorizer cost model target hooks. */
11967
11968 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11969 static int
11970 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11971 tree vectype,
11972 int misalign ATTRIBUTE_UNUSED)
11973 {
11974 unsigned elements;
11975 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11976 bool fp = false;
11977
11978 if (vectype != NULL)
11979 fp = FLOAT_TYPE_P (vectype);
11980
11981 switch (type_of_cost)
11982 {
11983 case scalar_stmt:
11984 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11985
11986 case scalar_load:
11987 return costs->scalar_load_cost;
11988
11989 case scalar_store:
11990 return costs->scalar_store_cost;
11991
11992 case vector_stmt:
11993 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11994
11995 case vector_load:
11996 return costs->vec_align_load_cost;
11997
11998 case vector_store:
11999 return costs->vec_store_cost;
12000
12001 case vec_to_scalar:
12002 return costs->vec_to_scalar_cost;
12003
12004 case scalar_to_vec:
12005 return costs->scalar_to_vec_cost;
12006
12007 case unaligned_load:
12008 case vector_gather_load:
12009 return costs->vec_unalign_load_cost;
12010
12011 case unaligned_store:
12012 case vector_scatter_store:
12013 return costs->vec_unalign_store_cost;
12014
12015 case cond_branch_taken:
12016 return costs->cond_taken_branch_cost;
12017
12018 case cond_branch_not_taken:
12019 return costs->cond_not_taken_branch_cost;
12020
12021 case vec_perm:
12022 return costs->vec_permute_cost;
12023
12024 case vec_promote_demote:
12025 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12026
12027 case vec_construct:
12028 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12029 return elements / 2 + 1;
12030
12031 default:
12032 gcc_unreachable ();
12033 }
12034 }
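/* Illustrative note (not part of the original source): for the
vec_construct case above, a V4SI vector has 4 subparts and is therefore
costed as 4 / 2 + 1 = 3.  */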
12035
12036 /* Implement targetm.vectorize.add_stmt_cost. */
12037 static unsigned
12038 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
12039 struct _stmt_vec_info *stmt_info, int misalign,
12040 enum vect_cost_model_location where)
12041 {
12042 unsigned *cost = (unsigned *) data;
12043 unsigned retval = 0;
12044
12045 if (flag_vect_cost_model)
12046 {
12047 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
12048 int stmt_cost =
12049 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
12050
12051 /* Statements in an inner loop relative to the loop being
12052 vectorized are weighted more heavily. The value here is
12053 arbitrary and could potentially be improved with analysis. */
12054 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
12055 count *= 50; /* FIXME */
12056
12057 retval = (unsigned) (count * stmt_cost);
12058 cost[where] += retval;
12059 }
12060
12061 return retval;
12062 }
12063
12064 static void initialize_aarch64_code_model (struct gcc_options *);
12065
12066 /* Parse the TO_PARSE string and put the architecture struct that it
12067 selects into RES and the architectural features into ISA_FLAGS.
12068 Return an aarch64_parse_opt_result describing the parse result.
12069 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12070 When the TO_PARSE string contains an invalid extension,
12071 a copy of the string is created and stored to INVALID_EXTENSION. */
12072
12073 static enum aarch64_parse_opt_result
12074 aarch64_parse_arch (const char *to_parse, const struct processor **res,
12075 uint64_t *isa_flags, std::string *invalid_extension)
12076 {
12077 const char *ext;
12078 const struct processor *arch;
12079 size_t len;
12080
12081 ext = strchr (to_parse, '+');
12082
12083 if (ext != NULL)
12084 len = ext - to_parse;
12085 else
12086 len = strlen (to_parse);
12087
12088 if (len == 0)
12089 return AARCH64_PARSE_MISSING_ARG;
12090
12091
12092 /* Loop through the list of supported ARCHes to find a match. */
12093 for (arch = all_architectures; arch->name != NULL; arch++)
12094 {
12095 if (strlen (arch->name) == len
12096 && strncmp (arch->name, to_parse, len) == 0)
12097 {
12098 uint64_t isa_temp = arch->flags;
12099
12100 if (ext != NULL)
12101 {
12102 /* TO_PARSE string contains at least one extension. */
12103 enum aarch64_parse_opt_result ext_res
12104 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12105
12106 if (ext_res != AARCH64_PARSE_OK)
12107 return ext_res;
12108 }
12109 /* Extension parsing was successful. Confirm the result
12110 arch and ISA flags. */
12111 *res = arch;
12112 *isa_flags = isa_temp;
12113 return AARCH64_PARSE_OK;
12114 }
12115 }
12116
12117 /* ARCH name not found in list. */
12118 return AARCH64_PARSE_INVALID_ARG;
12119 }
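/* Illustrative note (not in the original source): for an option such
as -march=armv8.2-a+sve, the string is split at the first '+', the
"armv8.2-a" prefix is matched against all_architectures, and the
remaining "+sve" extension string is handed to
aarch64_parse_extension.  */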
12120
12121 /* Parse the TO_PARSE string and put the result tuning in RES and the
12122 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12123 describing the parse result. If there is an error parsing, RES and
12124 ISA_FLAGS are left unchanged.
12125 When the TO_PARSE string contains an invalid extension,
12126 a copy of the string is created and stored to INVALID_EXTENSION. */
12127
12128 static enum aarch64_parse_opt_result
12129 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12130 uint64_t *isa_flags, std::string *invalid_extension)
12131 {
12132 const char *ext;
12133 const struct processor *cpu;
12134 size_t len;
12135
12136 ext = strchr (to_parse, '+');
12137
12138 if (ext != NULL)
12139 len = ext - to_parse;
12140 else
12141 len = strlen (to_parse);
12142
12143 if (len == 0)
12144 return AARCH64_PARSE_MISSING_ARG;
12145
12146
12147 /* Loop through the list of supported CPUs to find a match. */
12148 for (cpu = all_cores; cpu->name != NULL; cpu++)
12149 {
12150 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12151 {
12152 uint64_t isa_temp = cpu->flags;
12153
12154
12155 if (ext != NULL)
12156 {
12157 /* TO_PARSE string contains at least one extension. */
12158 enum aarch64_parse_opt_result ext_res
12159 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12160
12161 if (ext_res != AARCH64_PARSE_OK)
12162 return ext_res;
12163 }
12164 /* Extension parsing was successful. Confirm the result
12165 cpu and ISA flags. */
12166 *res = cpu;
12167 *isa_flags = isa_temp;
12168 return AARCH64_PARSE_OK;
12169 }
12170 }
12171
12172 /* CPU name not found in list. */
12173 return AARCH64_PARSE_INVALID_ARG;
12174 }
12175
12176 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12177 Return an aarch64_parse_opt_result describing the parse result.
12178 If the parsing fails, RES does not change. */
12179
12180 static enum aarch64_parse_opt_result
12181 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12182 {
12183 const struct processor *cpu;
12184
12185 /* Loop through the list of supported CPUs to find a match. */
12186 for (cpu = all_cores; cpu->name != NULL; cpu++)
12187 {
12188 if (strcmp (cpu->name, to_parse) == 0)
12189 {
12190 *res = cpu;
12191 return AARCH64_PARSE_OK;
12192 }
12193 }
12194
12195 /* CPU name not found in list. */
12196 return AARCH64_PARSE_INVALID_ARG;
12197 }
12198
12199 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12200 described in FLAG. If it is, return the index bit for that fusion type.
12201 If not, error (printing OPTION_NAME) and return zero. */
12202
12203 static unsigned int
12204 aarch64_parse_one_option_token (const char *token,
12205 size_t length,
12206 const struct aarch64_flag_desc *flag,
12207 const char *option_name)
12208 {
12209 for (; flag->name != NULL; flag++)
12210 {
12211 if (length == strlen (flag->name)
12212 && !strncmp (flag->name, token, length))
12213 return flag->flag;
12214 }
12215
12216 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12217 return 0;
12218 }
12219
12220 /* Parse OPTION which is a comma-separated list of flags to enable.
12221 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12222 default state we inherit from the CPU tuning structures. OPTION_NAME
12223 gives the top-level option we are parsing in the -moverride string,
12224 for use in error messages. */
12225
12226 static unsigned int
12227 aarch64_parse_boolean_options (const char *option,
12228 const struct aarch64_flag_desc *flags,
12229 unsigned int initial_state,
12230 const char *option_name)
12231 {
12232 const char separator = '.';
12233 const char* specs = option;
12234 const char* ntoken = option;
12235 unsigned int found_flags = initial_state;
12236
12237 while ((ntoken = strchr (specs, separator)))
12238 {
12239 size_t token_length = ntoken - specs;
12240 unsigned token_ops = aarch64_parse_one_option_token (specs,
12241 token_length,
12242 flags,
12243 option_name);
12244 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12245 in the token stream, reset the supported operations. So:
12246
12247 adrp+add.cmp+branch.none.adrp+add
12248
12249 would have the result of turning on only adrp+add fusion. */
12250 if (!token_ops)
12251 found_flags = 0;
12252
12253 found_flags |= token_ops;
12254 specs = ++ntoken;
12255 }
12256
12257 /* We ended with a trailing separator; diagnose it. */
12258 if (!(*specs))
12259 {
12260 error ("%s string ill-formed\n", option_name);
12261 return 0;
12262 }
12263
12264 /* We still have one more token to parse. */
12265 size_t token_length = strlen (specs);
12266 unsigned token_ops = aarch64_parse_one_option_token (specs,
12267 token_length,
12268 flags,
12269 option_name);
12270 if (!token_ops)
12271 found_flags = 0;
12272
12273 found_flags |= token_ops;
12274 return found_flags;
12275 }
12276
12277 /* Support for overriding instruction fusion. */
12278
12279 static void
12280 aarch64_parse_fuse_string (const char *fuse_string,
12281 struct tune_params *tune)
12282 {
12283 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12284 aarch64_fusible_pairs,
12285 tune->fusible_ops,
12286 "fuse=");
12287 }
12288
12289 /* Support for overriding other tuning flags. */
12290
12291 static void
12292 aarch64_parse_tune_string (const char *tune_string,
12293 struct tune_params *tune)
12294 {
12295 tune->extra_tuning_flags
12296 = aarch64_parse_boolean_options (tune_string,
12297 aarch64_tuning_flags,
12298 tune->extra_tuning_flags,
12299 "tune=");
12300 }
12301
12302 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
12303 Accept the valid SVE vector widths allowed by
12304 aarch64_sve_vector_bits_enum and use it to override sve_width
12305 in TUNE. */
12306
12307 static void
12308 aarch64_parse_sve_width_string (const char *tune_string,
12309 struct tune_params *tune)
12310 {
12311 int width = -1;
12312
12313 int n = sscanf (tune_string, "%d", &width);
12314 if (n == EOF)
12315 {
12316 error ("invalid format for sve_width");
12317 return;
12318 }
12319 switch (width)
12320 {
12321 case SVE_128:
12322 case SVE_256:
12323 case SVE_512:
12324 case SVE_1024:
12325 case SVE_2048:
12326 break;
12327 default:
12328 error ("invalid sve_width value: %d", width);
12329 }
12330 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12331 }
12332
12333 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12334 we understand. If it is, extract the option string and hand it off to
12335 the appropriate function. */
12336
12337 void
12338 aarch64_parse_one_override_token (const char* token,
12339 size_t length,
12340 struct tune_params *tune)
12341 {
12342 const struct aarch64_tuning_override_function *fn
12343 = aarch64_tuning_override_functions;
12344
12345 const char *option_part = strchr (token, '=');
12346 if (!option_part)
12347 {
12348 error ("tuning string missing in option (%s)", token);
12349 return;
12350 }
12351
12352 /* Get the length of the option name. */
12353 length = option_part - token;
12354 /* Skip the '=' to get to the option string. */
12355 option_part++;
12356
12357 for (; fn->name != NULL; fn++)
12358 {
12359 if (!strncmp (fn->name, token, length))
12360 {
12361 fn->parse_override (option_part, tune);
12362 return;
12363 }
12364 }
12365
12366 error ("unknown tuning option (%s)",token);
12367 return;
12368 }
12369
12370 /* Validate and clamp the TLS size according to the code model in OPTS. */
12371
12372 static void
12373 initialize_aarch64_tls_size (struct gcc_options *opts)
12374 {
12375 if (aarch64_tls_size == 0)
12376 aarch64_tls_size = 24;
12377
12378 switch (opts->x_aarch64_cmodel_var)
12379 {
12380 case AARCH64_CMODEL_TINY:
12381 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
12382 needs two instructions to address, so we clamp the size to 24 bits. */
12383 if (aarch64_tls_size > 24)
12384 aarch64_tls_size = 24;
12385 break;
12386 case AARCH64_CMODEL_SMALL:
12387 /* The maximum TLS size allowed under small is 4G. */
12388 if (aarch64_tls_size > 32)
12389 aarch64_tls_size = 32;
12390 break;
12391 case AARCH64_CMODEL_LARGE:
12392 /* The maximum TLS size allowed under large is 16E.
12393 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12394 if (aarch64_tls_size > 48)
12395 aarch64_tls_size = 48;
12396 break;
12397 default:
12398 gcc_unreachable ();
12399 }
12400
12401 return;
12402 }
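/* Illustrative note (not in the original source): for example, a
-mtls-size=32 request combined with -mcmodel=tiny is clamped to 24 bits
by the code above, while the same request under -mcmodel=small is
accepted unchanged.  */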
12403
12404 /* Parse STRING looking for options in the format:
12405 string :: option:string
12406 option :: name=substring
12407 name :: {a-z}
12408 substring :: defined by option. */
12409
12410 static void
12411 aarch64_parse_override_string (const char* input_string,
12412 struct tune_params* tune)
12413 {
12414 const char separator = ':';
12415 size_t string_length = strlen (input_string) + 1;
12416 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12417 char *string = string_root;
12418 strncpy (string, input_string, string_length);
12419 string[string_length - 1] = '\0';
12420
12421 char* ntoken = string;
12422
12423 while ((ntoken = strchr (string, separator)))
12424 {
12425 size_t token_length = ntoken - string;
12426 /* Make this substring look like a string. */
12427 *ntoken = '\0';
12428 aarch64_parse_one_override_token (string, token_length, tune);
12429 string = ++ntoken;
12430 }
12431
12432 /* One last option to parse. */
12433 aarch64_parse_one_override_token (string, strlen (string), tune);
12434 free (string_root);
12435 }
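/* Illustrative note (not part of the original source): an option such
as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

is split at ':' into the tokens "fuse=adrp+add.cmp+branch" and
"sve_width=256", each of which is then dispatched through
aarch64_parse_one_override_token.  */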
12436
12437
12438 static void
12439 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12440 {
12441 if (accepted_branch_protection_string)
12442 {
12443 opts->x_aarch64_branch_protection_string
12444 = xstrdup (accepted_branch_protection_string);
12445 }
12446
12447 /* PR 70044: We have to be careful about being called multiple times for the
12448 same function. This means all changes should be repeatable. */
12449
12450 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12451 Disable the frame pointer flag so the mid-end will not use a frame
12452 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12453 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12454 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12455 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12456 if (opts->x_flag_omit_frame_pointer == 0)
12457 opts->x_flag_omit_frame_pointer = 2;
12458
12459 /* If not optimizing for size, set the default
12460 alignment to what the target wants. */
12461 if (!opts->x_optimize_size)
12462 {
12463 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12464 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12465 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12466 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12467 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12468 opts->x_str_align_functions = aarch64_tune_params.function_align;
12469 }
12470
12471 /* We default to no pc-relative literal loads. */
12472
12473 aarch64_pcrelative_literal_loads = false;
12474
12475 /* If -mpc-relative-literal-loads is set on the command line, this
12476 implies that the user asked for PC relative literal loads. */
12477 if (opts->x_pcrelative_literal_loads == 1)
12478 aarch64_pcrelative_literal_loads = true;
12479
12480 /* In the tiny memory model it makes no sense to disallow PC relative
12481 literal pool loads. */
12482 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12483 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12484 aarch64_pcrelative_literal_loads = true;
12485
12486 /* When enabling the lower precision Newton series for the square root, also
12487 enable it for the reciprocal square root, since the latter is an
12488 intermediary step for the former. */
12489 if (flag_mlow_precision_sqrt)
12490 flag_mrecip_low_precision_sqrt = true;
12491 }
12492
12493 /* 'Unpack' the internal tuning structs and update the options
12494 in OPTS. The caller must have set up selected_tune and selected_arch
12495 as all the other target-specific codegen decisions are
12496 derived from them. */
12497
12498 void
12499 aarch64_override_options_internal (struct gcc_options *opts)
12500 {
12501 aarch64_tune_flags = selected_tune->flags;
12502 aarch64_tune = selected_tune->sched_core;
12503 /* Make a copy of the tuning parameters attached to the core, which
12504 we may later overwrite. */
12505 aarch64_tune_params = *(selected_tune->tune);
12506 aarch64_architecture_version = selected_arch->architecture_version;
12507
12508 if (opts->x_aarch64_override_tune_string)
12509 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12510 &aarch64_tune_params);
12511
12512 /* This target defaults to strict volatile bitfields. */
12513 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12514 opts->x_flag_strict_volatile_bitfields = 1;
12515
12516 if (aarch64_stack_protector_guard == SSP_GLOBAL
12517 && opts->x_aarch64_stack_protector_guard_offset_str)
12518 {
12519 error ("incompatible options %<-mstack-protector-guard=global%> and "
12520 "%<-mstack-protector-guard-offset=%s%>",
12521 aarch64_stack_protector_guard_offset_str);
12522 }
12523
12524 if (aarch64_stack_protector_guard == SSP_SYSREG
12525 && !(opts->x_aarch64_stack_protector_guard_offset_str
12526 && opts->x_aarch64_stack_protector_guard_reg_str))
12527 {
12528 error ("both %<-mstack-protector-guard-offset%> and "
12529 "%<-mstack-protector-guard-reg%> must be used "
12530 "with %<-mstack-protector-guard=sysreg%>");
12531 }
12532
12533 if (opts->x_aarch64_stack_protector_guard_reg_str)
12534 {
12535 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12536 error ("specify a system register with a small string length.");
12537 }
12538
12539 if (opts->x_aarch64_stack_protector_guard_offset_str)
12540 {
12541 char *end;
12542 const char *str = aarch64_stack_protector_guard_offset_str;
12543 errno = 0;
12544 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12545 if (!*str || *end || errno)
12546 error ("%qs is not a valid offset in %qs", str,
12547 "-mstack-protector-guard-offset=");
12548 aarch64_stack_protector_guard_offset = offs;
12549 }
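  /* For illustration (the register name and offset are hypothetical
     examples): with -mstack-protector-guard=sysreg the checks above
     require both of the other options as well, e.g.
     -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=8.  */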
12550
12551 initialize_aarch64_code_model (opts);
12552 initialize_aarch64_tls_size (opts);
12553
12554 int queue_depth = 0;
12555 switch (aarch64_tune_params.autoprefetcher_model)
12556 {
12557 case tune_params::AUTOPREFETCHER_OFF:
12558 queue_depth = -1;
12559 break;
12560 case tune_params::AUTOPREFETCHER_WEAK:
12561 queue_depth = 0;
12562 break;
12563 case tune_params::AUTOPREFETCHER_STRONG:
12564 queue_depth = max_insn_queue_index + 1;
12565 break;
12566 default:
12567 gcc_unreachable ();
12568 }
12569
12570 /* We don't mind passing in global_options_set here as we don't use
12571 the *options_set structs anyway. */
12572 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12573 queue_depth,
12574 opts->x_param_values,
12575 global_options_set.x_param_values);
12576
12577   /* Set up parameters to be used in the prefetching algorithm.  Do not
12578 override the defaults unless we are tuning for a core we have
12579 researched values for. */
12580 if (aarch64_tune_params.prefetch->num_slots > 0)
12581 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12582 aarch64_tune_params.prefetch->num_slots,
12583 opts->x_param_values,
12584 global_options_set.x_param_values);
12585 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12586 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12587 aarch64_tune_params.prefetch->l1_cache_size,
12588 opts->x_param_values,
12589 global_options_set.x_param_values);
12590 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12591 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12592 aarch64_tune_params.prefetch->l1_cache_line_size,
12593 opts->x_param_values,
12594 global_options_set.x_param_values);
12595 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12596 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12597 aarch64_tune_params.prefetch->l2_cache_size,
12598 opts->x_param_values,
12599 global_options_set.x_param_values);
12600 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12601 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12602 0,
12603 opts->x_param_values,
12604 global_options_set.x_param_values);
12605 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12606 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12607 aarch64_tune_params.prefetch->minimum_stride,
12608 opts->x_param_values,
12609 global_options_set.x_param_values);
12610
12611 /* Use the alternative scheduling-pressure algorithm by default. */
12612 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12613 opts->x_param_values,
12614 global_options_set.x_param_values);
12615
12616 /* If the user hasn't changed it via configure then set the default to 64 KB
12617 for the backend. */
12618 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12619 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12620 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12621 opts->x_param_values,
12622 global_options_set.x_param_values);
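  /* The guard size parameter is expressed as a power of two in bytes,
     so the value 16 used above corresponds to the 64 KB default
     mentioned in the comment.  */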
12623
12624 /* Validate the guard size. */
12625 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12626
12627   /* Enforce that the probing interval is the same as the guard size so the
12628      mid-end does the right thing.  */
12629 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12630 guard_size,
12631 opts->x_param_values,
12632 global_options_set.x_param_values);
12633
12634 /* The maybe_set calls won't update the value if the user has explicitly set
12635      one, which means we need to validate that the probing interval and guard size
12636 are equal. */
12637 int probe_interval
12638 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12639 if (guard_size != probe_interval)
12640 error ("stack clash guard size %<%d%> must be equal to probing interval "
12641 "%<%d%>", guard_size, probe_interval);
12642
12643   /* Enable software prefetching at the specified optimization level for
12644      CPUs that have prefetch tuning data, i.e. a non-negative
12645      default_opt_level in their prefetch tuning structure.  */
12646 if (opts->x_flag_prefetch_loop_arrays < 0
12647 && !opts->x_optimize_size
12648 && aarch64_tune_params.prefetch->default_opt_level >= 0
12649 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12650 opts->x_flag_prefetch_loop_arrays = 1;
12651
12652 if (opts->x_aarch64_arch_string == NULL)
12653 opts->x_aarch64_arch_string = selected_arch->name;
12654 if (opts->x_aarch64_cpu_string == NULL)
12655 opts->x_aarch64_cpu_string = selected_cpu->name;
12656 if (opts->x_aarch64_tune_string == NULL)
12657 opts->x_aarch64_tune_string = selected_tune->name;
12658
12659 aarch64_override_options_after_change_1 (opts);
12660 }
12661
12662 /* Print a hint with a suggestion for a core or architecture name that
12663 most closely resembles what the user passed in STR. ARCH is true if
12664 the user is asking for an architecture name. ARCH is false if the user
12665 is asking for a core name. */
12666
12667 static void
12668 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12669 {
12670 auto_vec<const char *> candidates;
12671 const struct processor *entry = arch ? all_architectures : all_cores;
12672 for (; entry->name != NULL; entry++)
12673 candidates.safe_push (entry->name);
12674
12675 #ifdef HAVE_LOCAL_CPU_DETECT
12676 /* Add also "native" as possible value. */
12677 if (arch)
12678 candidates.safe_push ("native");
12679 #endif
12680
12681 char *s;
12682 const char *hint = candidates_list_and_hint (str, s, candidates);
12683 if (hint)
12684 inform (input_location, "valid arguments are: %s;"
12685 " did you mean %qs?", s, hint);
12686 else
12687 inform (input_location, "valid arguments are: %s", s);
12688
12689 XDELETEVEC (s);
12690 }
12691
12692 /* Print a hint with a suggestion for a core name that most closely resembles
12693 what the user passed in STR. */
12694
12695 inline static void
12696 aarch64_print_hint_for_core (const char *str)
12697 {
12698 aarch64_print_hint_for_core_or_arch (str, false);
12699 }
12700
12701 /* Print a hint with a suggestion for an architecture name that most closely
12702 resembles what the user passed in STR. */
12703
12704 inline static void
12705 aarch64_print_hint_for_arch (const char *str)
12706 {
12707 aarch64_print_hint_for_core_or_arch (str, true);
12708 }
12709
12710
12711 /* Print a hint with a suggestion for an extension name
12712 that most closely resembles what the user passed in STR. */
12713
12714 void
12715 aarch64_print_hint_for_extensions (const std::string &str)
12716 {
12717 auto_vec<const char *> candidates;
12718 aarch64_get_all_extension_candidates (&candidates);
12719 char *s;
12720 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12721 if (hint)
12722 inform (input_location, "valid arguments are: %s;"
12723 " did you mean %qs?", s, hint);
12724 else
12725     inform (input_location, "valid arguments are: %s", s);
12726
12727 XDELETEVEC (s);
12728 }
12729
12730 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12731    specified in STR and throw errors if appropriate.  Put the results,
12732    if they are valid, in RES and ISA_FLAGS.  Return whether the option is
12733 valid. */
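/* For example (illustrative), "cortex-a57+crypto" names a core plus an
   optional list of feature modifiers; unknown cores or modifiers are
   diagnosed below together with a hint.  */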
12734
12735 static bool
12736 aarch64_validate_mcpu (const char *str, const struct processor **res,
12737 uint64_t *isa_flags)
12738 {
12739 std::string invalid_extension;
12740 enum aarch64_parse_opt_result parse_res
12741 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12742
12743 if (parse_res == AARCH64_PARSE_OK)
12744 return true;
12745
12746 switch (parse_res)
12747 {
12748 case AARCH64_PARSE_MISSING_ARG:
12749 error ("missing cpu name in %<-mcpu=%s%>", str);
12750 break;
12751 case AARCH64_PARSE_INVALID_ARG:
12752 error ("unknown value %qs for %<-mcpu%>", str);
12753 aarch64_print_hint_for_core (str);
12754 break;
12755 case AARCH64_PARSE_INVALID_FEATURE:
12756 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12757 invalid_extension.c_str (), str);
12758 aarch64_print_hint_for_extensions (invalid_extension);
12759 break;
12760 default:
12761 gcc_unreachable ();
12762 }
12763
12764 return false;
12765 }
12766
12767 /* Parses CONST_STR for branch protection features specified in
12768    aarch64_branch_protect_types, and sets any global variables required.  Returns
12769 the parsing result and assigns LAST_STR to the last processed token from
12770 CONST_STR so that it can be used for error reporting. */
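/* Illustrative inputs, assuming the usual contents of
   aarch64_branch_protect_types: "none", "standard", "bti" and
   "pac-ret+leaf".  Each '+'-separated token is matched first against
   the types and then against the subtypes of the matched type.  */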
12771
12772 static enum aarch64_parse_opt_result
12773 aarch64_parse_branch_protection (const char *const_str,
12774 				 char **last_str)
12775 {
12776 char *str_root = xstrdup (const_str);
12777 char* token_save = NULL;
12778 char *str = strtok_r (str_root, "+", &token_save);
12779 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12780 if (!str)
12781 res = AARCH64_PARSE_MISSING_ARG;
12782 else
12783 {
12784 char *next_str = strtok_r (NULL, "+", &token_save);
12785 /* Reset the branch protection features to their defaults. */
12786 aarch64_handle_no_branch_protection (NULL, NULL);
12787
12788 while (str && res == AARCH64_PARSE_OK)
12789 {
12790 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12791 bool found = false;
12792 /* Search for this type. */
12793 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12794 {
12795 if (strcmp (str, type->name) == 0)
12796 {
12797 found = true;
12798 res = type->handler (str, next_str);
12799 str = next_str;
12800 next_str = strtok_r (NULL, "+", &token_save);
12801 }
12802 else
12803 type++;
12804 }
12805 if (found && res == AARCH64_PARSE_OK)
12806 {
12807 bool found_subtype = true;
12808 /* Loop through each token until we find one that isn't a
12809 subtype. */
12810 while (found_subtype)
12811 {
12812 found_subtype = false;
12813 const aarch64_branch_protect_type *subtype = type->subtypes;
12814 /* Search for the subtype. */
12815 while (str && subtype && subtype->name && !found_subtype
12816 && res == AARCH64_PARSE_OK)
12817 {
12818 if (strcmp (str, subtype->name) == 0)
12819 {
12820 found_subtype = true;
12821 res = subtype->handler (str, next_str);
12822 str = next_str;
12823 next_str = strtok_r (NULL, "+", &token_save);
12824 }
12825 else
12826 subtype++;
12827 }
12828 }
12829 }
12830 else if (!found)
12831 res = AARCH64_PARSE_INVALID_ARG;
12832 }
12833 }
12834 /* Copy the last processed token into the argument to pass it back.
12835 Used by option and attribute validation to print the offending token. */
12836 if (last_str)
12837 {
12838 if (str) strcpy (*last_str, str);
12839 else *last_str = NULL;
12840 }
12841 if (res == AARCH64_PARSE_OK)
12842 {
12843 /* If needed, alloc the accepted string then copy in const_str.
12844 	 Used by aarch64_override_options_after_change_1.  */
12845 if (!accepted_branch_protection_string)
12846 accepted_branch_protection_string = (char *) xmalloc (
12847 BRANCH_PROTECT_STR_MAX
12848 + 1);
12849 strncpy (accepted_branch_protection_string, const_str,
12850 BRANCH_PROTECT_STR_MAX + 1);
12851 /* Forcibly null-terminate. */
12852 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12853 }
12854 return res;
12855 }
12856
12857 static bool
12858 aarch64_validate_mbranch_protection (const char *const_str)
12859 {
12860   char *str = (char *) xmalloc (strlen (const_str) + 1);
12861 enum aarch64_parse_opt_result res =
12862 aarch64_parse_branch_protection (const_str, &str);
12863 if (res == AARCH64_PARSE_INVALID_ARG)
12864 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12865 else if (res == AARCH64_PARSE_MISSING_ARG)
12866 error ("missing argument for %<-mbranch-protection=%>");
12867 free (str);
12868 return res == AARCH64_PARSE_OK;
12869 }
12870
12871 /* Validate a command-line -march option. Parse the arch and extensions
12872 (if any) specified in STR and throw errors if appropriate. Put the
12873 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12874 option is valid. */
12875
12876 static bool
12877 aarch64_validate_march (const char *str, const struct processor **res,
12878 uint64_t *isa_flags)
12879 {
12880 std::string invalid_extension;
12881 enum aarch64_parse_opt_result parse_res
12882 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12883
12884 if (parse_res == AARCH64_PARSE_OK)
12885 return true;
12886
12887 switch (parse_res)
12888 {
12889 case AARCH64_PARSE_MISSING_ARG:
12890 error ("missing arch name in %<-march=%s%>", str);
12891 break;
12892 case AARCH64_PARSE_INVALID_ARG:
12893 error ("unknown value %qs for %<-march%>", str);
12894 aarch64_print_hint_for_arch (str);
12895 break;
12896 case AARCH64_PARSE_INVALID_FEATURE:
12897 error ("invalid feature modifier %qs in %<-march=%s%>",
12898 invalid_extension.c_str (), str);
12899 aarch64_print_hint_for_extensions (invalid_extension);
12900 break;
12901 default:
12902 gcc_unreachable ();
12903 }
12904
12905 return false;
12906 }
12907
12908 /* Validate a command-line -mtune option. Parse the cpu
12909 specified in STR and throw errors if appropriate. Put the
12910 result, if it is valid, in RES. Return whether the option is
12911 valid. */
12912
12913 static bool
12914 aarch64_validate_mtune (const char *str, const struct processor **res)
12915 {
12916 enum aarch64_parse_opt_result parse_res
12917 = aarch64_parse_tune (str, res);
12918
12919 if (parse_res == AARCH64_PARSE_OK)
12920 return true;
12921
12922 switch (parse_res)
12923 {
12924 case AARCH64_PARSE_MISSING_ARG:
12925 error ("missing cpu name in %<-mtune=%s%>", str);
12926 break;
12927 case AARCH64_PARSE_INVALID_ARG:
12928 error ("unknown value %qs for %<-mtune%>", str);
12929 aarch64_print_hint_for_core (str);
12930 break;
12931 default:
12932 gcc_unreachable ();
12933 }
12934 return false;
12935 }
12936
12937 /* Return the CPU corresponding to the enum CPU.
12938 If it doesn't specify a cpu, return the default. */
12939
12940 static const struct processor *
12941 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12942 {
12943 if (cpu != aarch64_none)
12944 return &all_cores[cpu];
12945
12946 /* The & 0x3f is to extract the bottom 6 bits that encode the
12947 default cpu as selected by the --with-cpu GCC configure option
12948 in config.gcc.
12949 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12950 flags mechanism should be reworked to make it more sane. */
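  /* The remaining upper bits of TARGET_CPU_DEFAULT encode the default
     ISA flags; they are consumed via TARGET_CPU_DEFAULT >> 6 in
     aarch64_override_options below.  */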
12951 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12952 }
12953
12954 /* Return the architecture corresponding to the enum ARCH.
12955 If it doesn't specify a valid architecture, return the default. */
12956
12957 static const struct processor *
12958 aarch64_get_arch (enum aarch64_arch arch)
12959 {
12960 if (arch != aarch64_no_arch)
12961 return &all_architectures[arch];
12962
12963 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12964
12965 return &all_architectures[cpu->arch];
12966 }
12967
12968 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
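/* For example, -msve-vector-bits=256 yields a VG (number of 64-bit
   granules) of 256 / 64 = 4, whereas "scalable" and 128 are handled
   specially below and produce vector-length agnostic code.  */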
12969
12970 static poly_uint16
12971 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12972 {
12973 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12974 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12975 deciding which .md file patterns to use and when deciding whether
12976 something is a legitimate address or constant. */
12977 if (value == SVE_SCALABLE || value == SVE_128)
12978 return poly_uint16 (2, 2);
12979 else
12980 return (int) value / 64;
12981 }
12982
12983 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12984    and is used to parse the -m{cpu,tune,arch} strings and set up the initial
12985 tuning structs. In particular it must set selected_tune and
12986 aarch64_isa_flags that define the available ISA features and tuning
12987 decisions. It must also set selected_arch as this will be used to
12988 output the .arch asm tags for each function. */
12989
12990 static void
12991 aarch64_override_options (void)
12992 {
12993 uint64_t cpu_isa = 0;
12994 uint64_t arch_isa = 0;
12995 aarch64_isa_flags = 0;
12996
12997 bool valid_cpu = true;
12998 bool valid_tune = true;
12999 bool valid_arch = true;
13000
13001 selected_cpu = NULL;
13002 selected_arch = NULL;
13003 selected_tune = NULL;
13004
13005 if (aarch64_branch_protection_string)
13006 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13007
13008 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13009 If either of -march or -mtune is given, they override their
13010 respective component of -mcpu. */
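  /* For example (option values are illustrative): -mcpu=foo -mtune=bar
     keeps foo's architecture but tunes for bar, while -mcpu=foo
     -march=baz targets architecture baz (with a compatibility warning
     below if foo and baz disagree) and keeps foo for tuning.  */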
13011 if (aarch64_cpu_string)
13012 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13013 &cpu_isa);
13014
13015 if (aarch64_arch_string)
13016 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13017 &arch_isa);
13018
13019 if (aarch64_tune_string)
13020 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13021
13022 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13023 SUBTARGET_OVERRIDE_OPTIONS;
13024 #endif
13025
13026 /* If the user did not specify a processor, choose the default
13027 one for them. This will be the CPU set during configuration using
13028 --with-cpu, otherwise it is "generic". */
13029 if (!selected_cpu)
13030 {
13031 if (selected_arch)
13032 {
13033 selected_cpu = &all_cores[selected_arch->ident];
13034 aarch64_isa_flags = arch_isa;
13035 explicit_arch = selected_arch->arch;
13036 }
13037 else
13038 {
13039 /* Get default configure-time CPU. */
13040 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
13041 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
13042 }
13043
13044 if (selected_tune)
13045 explicit_tune_core = selected_tune->ident;
13046 }
13047 /* If both -mcpu and -march are specified check that they are architecturally
13048 compatible, warn if they're not and prefer the -march ISA flags. */
13049 else if (selected_arch)
13050 {
13051 if (selected_arch->arch != selected_cpu->arch)
13052 {
13053 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13054 all_architectures[selected_cpu->arch].name,
13055 selected_arch->name);
13056 }
13057 aarch64_isa_flags = arch_isa;
13058 explicit_arch = selected_arch->arch;
13059 explicit_tune_core = selected_tune ? selected_tune->ident
13060 : selected_cpu->ident;
13061 }
13062 else
13063 {
13064 /* -mcpu but no -march. */
13065 aarch64_isa_flags = cpu_isa;
13066 explicit_tune_core = selected_tune ? selected_tune->ident
13067 : selected_cpu->ident;
13068 gcc_assert (selected_cpu);
13069 selected_arch = &all_architectures[selected_cpu->arch];
13070 explicit_arch = selected_arch->arch;
13071 }
13072
13073   /* Set the arch as well, as we will need it when outputting
13074 the .arch directive in assembly. */
13075 if (!selected_arch)
13076 {
13077 gcc_assert (selected_cpu);
13078 selected_arch = &all_architectures[selected_cpu->arch];
13079 }
13080
13081 if (!selected_tune)
13082 selected_tune = selected_cpu;
13083
13084 if (aarch64_enable_bti == 2)
13085 {
13086 #ifdef TARGET_ENABLE_BTI
13087 aarch64_enable_bti = 1;
13088 #else
13089 aarch64_enable_bti = 0;
13090 #endif
13091 }
13092
13093 /* Return address signing is currently not supported for ILP32 targets. For
13094 LP64 targets use the configured option in the absence of a command-line
13095 option for -mbranch-protection. */
13096 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
13097 {
13098 #ifdef TARGET_ENABLE_PAC_RET
13099 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13100 #else
13101 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13102 #endif
13103 }
13104
13105 #ifndef HAVE_AS_MABI_OPTION
13106 /* The compiler may have been configured with 2.23.* binutils, which does
13107 not have support for ILP32. */
13108 if (TARGET_ILP32)
13109 error ("assembler does not support %<-mabi=ilp32%>");
13110 #endif
13111
13112 /* Convert -msve-vector-bits to a VG count. */
13113 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13114
13115 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13116 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13117
13118 /* Make sure we properly set up the explicit options. */
13119 if ((aarch64_cpu_string && valid_cpu)
13120 || (aarch64_tune_string && valid_tune))
13121 gcc_assert (explicit_tune_core != aarch64_none);
13122
13123 if ((aarch64_cpu_string && valid_cpu)
13124 || (aarch64_arch_string && valid_arch))
13125 gcc_assert (explicit_arch != aarch64_no_arch);
13126
13127 /* The pass to insert speculation tracking runs before
13128 shrink-wrapping and the latter does not know how to update the
13129      tracking status.  So disable shrink-wrapping in this case.  */
13130 if (aarch64_track_speculation)
13131 flag_shrink_wrap = 0;
13132
13133 aarch64_override_options_internal (&global_options);
13134
13135 /* Save these options as the default ones in case we push and pop them later
13136 while processing functions with potential target attributes. */
13137 target_option_default_node = target_option_current_node
13138 = build_target_option_node (&global_options);
13139 }
13140
13141 /* Implement targetm.override_options_after_change. */
13142
13143 static void
13144 aarch64_override_options_after_change (void)
13145 {
13146 aarch64_override_options_after_change_1 (&global_options);
13147 }
13148
13149 static struct machine_function *
13150 aarch64_init_machine_status (void)
13151 {
13152 struct machine_function *machine;
13153 machine = ggc_cleared_alloc<machine_function> ();
13154 return machine;
13155 }
13156
13157 void
13158 aarch64_init_expanders (void)
13159 {
13160 init_machine_status = aarch64_init_machine_status;
13161 }
13162
13163 /* Initialize aarch64_cmodel from the code model and PIC options in OPTS.  */
13164 static void
13165 initialize_aarch64_code_model (struct gcc_options *opts)
13166 {
13167 if (opts->x_flag_pic)
13168 {
13169 switch (opts->x_aarch64_cmodel_var)
13170 {
13171 case AARCH64_CMODEL_TINY:
13172 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13173 break;
13174 case AARCH64_CMODEL_SMALL:
13175 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13176 aarch64_cmodel = (flag_pic == 2
13177 ? AARCH64_CMODEL_SMALL_PIC
13178 : AARCH64_CMODEL_SMALL_SPIC);
13179 #else
13180 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13181 #endif
13182 break;
13183 case AARCH64_CMODEL_LARGE:
13184 sorry ("code model %qs with %<-f%s%>", "large",
13185 opts->x_flag_pic > 1 ? "PIC" : "pic");
13186 break;
13187 default:
13188 gcc_unreachable ();
13189 }
13190 }
13191 else
13192 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13193 }
13194
13195 /* Implement TARGET_OPTION_SAVE. */
13196
13197 static void
13198 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13199 {
13200 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13201 ptr->x_aarch64_branch_protection_string
13202 = opts->x_aarch64_branch_protection_string;
13203 }
13204
13205 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13206 using the information saved in PTR. */
13207
13208 static void
13209 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13210 {
13211 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13212 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13213 opts->x_explicit_arch = ptr->x_explicit_arch;
13214 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13215 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13216 opts->x_aarch64_branch_protection_string
13217 = ptr->x_aarch64_branch_protection_string;
13218 if (opts->x_aarch64_branch_protection_string)
13219 {
13220 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13221 NULL);
13222 }
13223
13224 aarch64_override_options_internal (opts);
13225 }
13226
13227 /* Implement TARGET_OPTION_PRINT. */
13228
13229 static void
13230 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13231 {
13232 const struct processor *cpu
13233 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13234 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13235 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13236 std::string extension
13237 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13238
13239 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13240 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13241 arch->name, extension.c_str ());
13242 }
13243
13244 static GTY(()) tree aarch64_previous_fndecl;
13245
13246 void
13247 aarch64_reset_previous_fndecl (void)
13248 {
13249 aarch64_previous_fndecl = NULL;
13250 }
13251
13252 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13253 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13254 make sure optab availability predicates are recomputed when necessary. */
13255
13256 void
13257 aarch64_save_restore_target_globals (tree new_tree)
13258 {
13259 if (TREE_TARGET_GLOBALS (new_tree))
13260 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13261 else if (new_tree == target_option_default_node)
13262 restore_target_globals (&default_target_globals);
13263 else
13264 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13265 }
13266
13267 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13268 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13269 of the function, if such exists. This function may be called multiple
13270 times on a single function so use aarch64_previous_fndecl to avoid
13271 setting up identical state. */
13272
13273 static void
13274 aarch64_set_current_function (tree fndecl)
13275 {
13276 if (!fndecl || fndecl == aarch64_previous_fndecl)
13277 return;
13278
13279 tree old_tree = (aarch64_previous_fndecl
13280 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13281 : NULL_TREE);
13282
13283 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13284
13285   /* If the current function has no attributes but the previous one did,
13286 use the default node. */
13287 if (!new_tree && old_tree)
13288 new_tree = target_option_default_node;
13289
13290 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13291 the default have been handled by aarch64_save_restore_target_globals from
13292 aarch64_pragma_target_parse. */
13293 if (old_tree == new_tree)
13294 return;
13295
13296 aarch64_previous_fndecl = fndecl;
13297
13298 /* First set the target options. */
13299 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13300
13301 aarch64_save_restore_target_globals (new_tree);
13302 }
13303
13304 /* Enum describing the various ways we can handle attributes.
13305 In many cases we can reuse the generic option handling machinery. */
13306
13307 enum aarch64_attr_opt_type
13308 {
13309 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13310 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13311 aarch64_attr_enum, /* Attribute sets an enum variable. */
13312 aarch64_attr_custom /* Attribute requires a custom handling function. */
13313 };
13314
13315 /* All the information needed to handle a target attribute.
13316 NAME is the name of the attribute.
13317 ATTR_TYPE specifies the type of behavior of the attribute as described
13318 in the definition of enum aarch64_attr_opt_type.
13319 ALLOW_NEG is true if the attribute supports a "no-" form.
13320    HANDLER is the function that takes the attribute string as an argument.
13321 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13322 OPT_NUM is the enum specifying the option that the attribute modifies.
13323 This is needed for attributes that mirror the behavior of a command-line
13324 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13325 aarch64_attr_enum. */
13326
13327 struct aarch64_attribute_info
13328 {
13329 const char *name;
13330 enum aarch64_attr_opt_type attr_type;
13331 bool allow_neg;
13332 bool (*handler) (const char *);
13333 enum opt_code opt_num;
13334 };
13335
13336 /* Handle the ARCH_STR argument to the arch= target attribute. */
13337
13338 static bool
13339 aarch64_handle_attr_arch (const char *str)
13340 {
13341 const struct processor *tmp_arch = NULL;
13342 std::string invalid_extension;
13343 enum aarch64_parse_opt_result parse_res
13344 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13345
13346 if (parse_res == AARCH64_PARSE_OK)
13347 {
13348 gcc_assert (tmp_arch);
13349 selected_arch = tmp_arch;
13350 explicit_arch = selected_arch->arch;
13351 return true;
13352 }
13353
13354 switch (parse_res)
13355 {
13356 case AARCH64_PARSE_MISSING_ARG:
13357 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13358 break;
13359 case AARCH64_PARSE_INVALID_ARG:
13360 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13361 aarch64_print_hint_for_arch (str);
13362 break;
13363 case AARCH64_PARSE_INVALID_FEATURE:
13364 error ("invalid feature modifier %s of value (\"%s\") in "
13365 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13366 aarch64_print_hint_for_extensions (invalid_extension);
13367 break;
13368 default:
13369 gcc_unreachable ();
13370 }
13371
13372 return false;
13373 }
13374
13375 /* Handle the argument CPU_STR to the cpu= target attribute. */
13376
13377 static bool
13378 aarch64_handle_attr_cpu (const char *str)
13379 {
13380 const struct processor *tmp_cpu = NULL;
13381 std::string invalid_extension;
13382 enum aarch64_parse_opt_result parse_res
13383 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13384
13385 if (parse_res == AARCH64_PARSE_OK)
13386 {
13387 gcc_assert (tmp_cpu);
13388 selected_tune = tmp_cpu;
13389 explicit_tune_core = selected_tune->ident;
13390
13391 selected_arch = &all_architectures[tmp_cpu->arch];
13392 explicit_arch = selected_arch->arch;
13393 return true;
13394 }
13395
13396 switch (parse_res)
13397 {
13398 case AARCH64_PARSE_MISSING_ARG:
13399 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13400 break;
13401 case AARCH64_PARSE_INVALID_ARG:
13402 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13403 aarch64_print_hint_for_core (str);
13404 break;
13405 case AARCH64_PARSE_INVALID_FEATURE:
13406 error ("invalid feature modifier %s of value (\"%s\") in "
13407 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13408 aarch64_print_hint_for_extensions (invalid_extension);
13409 break;
13410 default:
13411 gcc_unreachable ();
13412 }
13413
13414 return false;
13415 }
13416
13417 /* Handle the argument STR to the branch-protection= attribute. */
13418
13419 static bool
13420 aarch64_handle_attr_branch_protection (const char* str)
13421 {
13422   char *err_str = (char *) xmalloc (strlen (str) + 1);
13423 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13424 &err_str);
13425 bool success = false;
13426 switch (res)
13427 {
13428 case AARCH64_PARSE_MISSING_ARG:
13429 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13430 " attribute");
13431 break;
13432 case AARCH64_PARSE_INVALID_ARG:
13433 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13434 "=\")%> pragma or attribute", err_str);
13435 break;
13436 case AARCH64_PARSE_OK:
13437 success = true;
13438 /* Fall through. */
13439 case AARCH64_PARSE_INVALID_FEATURE:
13440 break;
13441 default:
13442 gcc_unreachable ();
13443 }
13444 free (err_str);
13445 return success;
13446 }
13447
13448 /* Handle the argument STR to the tune= target attribute. */
13449
13450 static bool
13451 aarch64_handle_attr_tune (const char *str)
13452 {
13453 const struct processor *tmp_tune = NULL;
13454 enum aarch64_parse_opt_result parse_res
13455 = aarch64_parse_tune (str, &tmp_tune);
13456
13457 if (parse_res == AARCH64_PARSE_OK)
13458 {
13459 gcc_assert (tmp_tune);
13460 selected_tune = tmp_tune;
13461 explicit_tune_core = selected_tune->ident;
13462 return true;
13463 }
13464
13465 switch (parse_res)
13466 {
13467 case AARCH64_PARSE_INVALID_ARG:
13468 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13469 aarch64_print_hint_for_core (str);
13470 break;
13471 default:
13472 gcc_unreachable ();
13473 }
13474
13475 return false;
13476 }
13477
13478 /* Parse an architecture extensions target attribute string specified in STR.
13479 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13480 if successful. Update aarch64_isa_flags to reflect the ISA features
13481 modified. */
13482
13483 static bool
13484 aarch64_handle_attr_isa_flags (char *str)
13485 {
13486 enum aarch64_parse_opt_result parse_res;
13487 uint64_t isa_flags = aarch64_isa_flags;
13488
13489 /* We allow "+nothing" in the beginning to clear out all architectural
13490 features if the user wants to handpick specific features. */
13491 if (strncmp ("+nothing", str, 8) == 0)
13492 {
13493 isa_flags = 0;
13494 str += 8;
13495 }
13496
13497 std::string invalid_extension;
13498 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13499
13500 if (parse_res == AARCH64_PARSE_OK)
13501 {
13502 aarch64_isa_flags = isa_flags;
13503 return true;
13504 }
13505
13506 switch (parse_res)
13507 {
13508 case AARCH64_PARSE_MISSING_ARG:
13509 error ("missing value in %<target()%> pragma or attribute");
13510 break;
13511
13512 case AARCH64_PARSE_INVALID_FEATURE:
13513 error ("invalid feature modifier %s of value (\"%s\") in "
13514 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13515 break;
13516
13517 default:
13518 gcc_unreachable ();
13519 }
13520
13521 return false;
13522 }
13523
13524 /* The target attributes that we support. On top of these we also support just
13525 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13526 handled explicitly in aarch64_process_one_target_attr. */
13527
13528 static const struct aarch64_attribute_info aarch64_attributes[] =
13529 {
13530 { "general-regs-only", aarch64_attr_mask, false, NULL,
13531 OPT_mgeneral_regs_only },
13532 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13533 OPT_mfix_cortex_a53_835769 },
13534 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13535 OPT_mfix_cortex_a53_843419 },
13536 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13537 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13538 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13539 OPT_momit_leaf_frame_pointer },
13540 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13541 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13542 OPT_march_ },
13543 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13544 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13545 OPT_mtune_ },
13546 { "branch-protection", aarch64_attr_custom, false,
13547 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13548 { "sign-return-address", aarch64_attr_enum, false, NULL,
13549 OPT_msign_return_address_ },
13550 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13551 };
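/* Illustrative only: with the table above, an attribute such as
   __attribute__ ((target ("arch=armv8-a+crc,no-strict-align"))) is
   split on ',' by aarch64_process_target_attr and each token is then
   handled by aarch64_process_one_target_attr below.  */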
13552
13553 /* Parse ARG_STR which contains the definition of one target attribute.
13554 Show appropriate errors if any or return true if the attribute is valid. */
13555
13556 static bool
13557 aarch64_process_one_target_attr (char *arg_str)
13558 {
13559 bool invert = false;
13560
13561 size_t len = strlen (arg_str);
13562
13563 if (len == 0)
13564 {
13565 error ("malformed %<target()%> pragma or attribute");
13566 return false;
13567 }
13568
13569 char *str_to_check = (char *) alloca (len + 1);
13570 strcpy (str_to_check, arg_str);
13571
13572 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13573 It is easier to detect and handle it explicitly here rather than going
13574 through the machinery for the rest of the target attributes in this
13575 function. */
13576 if (*str_to_check == '+')
13577 return aarch64_handle_attr_isa_flags (str_to_check);
13578
13579 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13580 {
13581 invert = true;
13582 str_to_check += 3;
13583 }
13584 char *arg = strchr (str_to_check, '=');
13585
13586 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13587 and point ARG to "foo". */
13588 if (arg)
13589 {
13590 *arg = '\0';
13591 arg++;
13592 }
13593 const struct aarch64_attribute_info *p_attr;
13594 bool found = false;
13595 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13596 {
13597 /* If the names don't match up, or the user has given an argument
13598 to an attribute that doesn't accept one, or didn't give an argument
13599 to an attribute that expects one, fail to match. */
13600 if (strcmp (str_to_check, p_attr->name) != 0)
13601 continue;
13602
13603 found = true;
13604 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13605 || p_attr->attr_type == aarch64_attr_enum;
13606
13607 if (attr_need_arg_p ^ (arg != NULL))
13608 {
13609 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13610 return false;
13611 }
13612
13613 /* If the name matches but the attribute does not allow "no-" versions
13614 then we can't match. */
13615 if (invert && !p_attr->allow_neg)
13616 {
13617 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13618 return false;
13619 }
13620
13621 switch (p_attr->attr_type)
13622 {
13623 /* Has a custom handler registered.
13624 For example, cpu=, arch=, tune=. */
13625 case aarch64_attr_custom:
13626 gcc_assert (p_attr->handler);
13627 if (!p_attr->handler (arg))
13628 return false;
13629 break;
13630
13631 /* Either set or unset a boolean option. */
13632 case aarch64_attr_bool:
13633 {
13634 struct cl_decoded_option decoded;
13635
13636 generate_option (p_attr->opt_num, NULL, !invert,
13637 CL_TARGET, &decoded);
13638 aarch64_handle_option (&global_options, &global_options_set,
13639 &decoded, input_location);
13640 break;
13641 }
13642 /* Set or unset a bit in the target_flags. aarch64_handle_option
13643 should know what mask to apply given the option number. */
13644 case aarch64_attr_mask:
13645 {
13646 struct cl_decoded_option decoded;
13647 /* We only need to specify the option number.
13648 aarch64_handle_option will know which mask to apply. */
13649 decoded.opt_index = p_attr->opt_num;
13650 decoded.value = !invert;
13651 aarch64_handle_option (&global_options, &global_options_set,
13652 &decoded, input_location);
13653 break;
13654 }
13655 /* Use the option setting machinery to set an option to an enum. */
13656 case aarch64_attr_enum:
13657 {
13658 gcc_assert (arg);
13659 bool valid;
13660 int value;
13661 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13662 &value, CL_TARGET);
13663 if (valid)
13664 {
13665 set_option (&global_options, NULL, p_attr->opt_num, value,
13666 NULL, DK_UNSPECIFIED, input_location,
13667 global_dc);
13668 }
13669 else
13670 {
13671 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13672 }
13673 break;
13674 }
13675 default:
13676 gcc_unreachable ();
13677 }
13678 }
13679
13680 /* If we reached here we either have found an attribute and validated
13681 it or didn't match any. If we matched an attribute but its arguments
13682 were malformed we will have returned false already. */
13683 return found;
13684 }
13685
13686 /* Count how many times the character C appears in
13687 NULL-terminated string STR. */
13688
13689 static unsigned int
13690 num_occurences_in_str (char c, char *str)
13691 {
13692 unsigned int res = 0;
13693 while (*str != '\0')
13694 {
13695 if (*str == c)
13696 res++;
13697
13698 str++;
13699 }
13700
13701 return res;
13702 }
13703
13704 /* Parse the tree in ARGS that contains the target attribute information
13705 and update the global target options space. */
13706
13707 bool
13708 aarch64_process_target_attr (tree args)
13709 {
13710 if (TREE_CODE (args) == TREE_LIST)
13711 {
13712 do
13713 {
13714 tree head = TREE_VALUE (args);
13715 if (head)
13716 {
13717 if (!aarch64_process_target_attr (head))
13718 return false;
13719 }
13720 args = TREE_CHAIN (args);
13721 } while (args);
13722
13723 return true;
13724 }
13725
13726 if (TREE_CODE (args) != STRING_CST)
13727 {
13728 error ("attribute %<target%> argument not a string");
13729 return false;
13730 }
13731
13732 size_t len = strlen (TREE_STRING_POINTER (args));
13733 char *str_to_check = (char *) alloca (len + 1);
13734 strcpy (str_to_check, TREE_STRING_POINTER (args));
13735
13736 if (len == 0)
13737 {
13738 error ("malformed %<target()%> pragma or attribute");
13739 return false;
13740 }
13741
13742   /* Used to catch empty strings between commas, e.g.
13743 attribute ((target ("attr1,,attr2"))). */
13744 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13745
13746 /* Handle multiple target attributes separated by ','. */
13747 char *token = strtok_r (str_to_check, ",", &str_to_check);
13748
13749 unsigned int num_attrs = 0;
13750 while (token)
13751 {
13752 num_attrs++;
13753 if (!aarch64_process_one_target_attr (token))
13754 {
13755 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13756 return false;
13757 }
13758
13759 token = strtok_r (NULL, ",", &str_to_check);
13760 }
13761
13762 if (num_attrs != num_commas + 1)
13763 {
13764 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13765 return false;
13766 }
13767
13768 return true;
13769 }
13770
13771 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13772 process attribute ((target ("..."))). */
13773
13774 static bool
13775 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13776 {
13777 struct cl_target_option cur_target;
13778 bool ret;
13779 tree old_optimize;
13780 tree new_target, new_optimize;
13781 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13782
13783 /* If what we're processing is the current pragma string then the
13784 target option node is already stored in target_option_current_node
13785 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13786 having to re-parse the string. This is especially useful to keep
13787 arm_neon.h compile times down since that header contains a lot
13788 of intrinsics enclosed in pragmas. */
13789 if (!existing_target && args == current_target_pragma)
13790 {
13791 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13792 return true;
13793 }
13794 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13795
13796 old_optimize = build_optimization_node (&global_options);
13797 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13798
13799 /* If the function changed the optimization levels as well as setting
13800 target options, start with the optimizations specified. */
13801 if (func_optimize && func_optimize != old_optimize)
13802 cl_optimization_restore (&global_options,
13803 TREE_OPTIMIZATION (func_optimize));
13804
13805 /* Save the current target options to restore at the end. */
13806 cl_target_option_save (&cur_target, &global_options);
13807
13808 /* If fndecl already has some target attributes applied to it, unpack
13809 them so that we add this attribute on top of them, rather than
13810 overwriting them. */
13811 if (existing_target)
13812 {
13813 struct cl_target_option *existing_options
13814 = TREE_TARGET_OPTION (existing_target);
13815
13816 if (existing_options)
13817 cl_target_option_restore (&global_options, existing_options);
13818 }
13819 else
13820 cl_target_option_restore (&global_options,
13821 TREE_TARGET_OPTION (target_option_current_node));
13822
13823 ret = aarch64_process_target_attr (args);
13824
13825 /* Set up any additional state. */
13826 if (ret)
13827 {
13828 aarch64_override_options_internal (&global_options);
13829 /* Initialize SIMD builtins if we haven't already.
13830 Set current_target_pragma to NULL for the duration so that
13831 the builtin initialization code doesn't try to tag the functions
13832 being built with the attributes specified by any current pragma, thus
13833 going into an infinite recursion. */
13834 if (TARGET_SIMD)
13835 {
13836 tree saved_current_target_pragma = current_target_pragma;
13837 current_target_pragma = NULL;
13838 aarch64_init_simd_builtins ();
13839 current_target_pragma = saved_current_target_pragma;
13840 }
13841 new_target = build_target_option_node (&global_options);
13842 }
13843 else
13844 new_target = NULL;
13845
13846 new_optimize = build_optimization_node (&global_options);
13847
13848 if (fndecl && ret)
13849 {
13850 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13851
13852 if (old_optimize != new_optimize)
13853 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13854 }
13855
13856 cl_target_option_restore (&global_options, &cur_target);
13857
13858 if (old_optimize != new_optimize)
13859 cl_optimization_restore (&global_options,
13860 TREE_OPTIMIZATION (old_optimize));
13861 return ret;
13862 }
13863
13864 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
13865    tri-bool options (yes, no, don't care), with DONT_CARE marking the
13866    don't-care value and DEF the default, return true if inlining is allowed.  */
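/* For illustration, with DONT_CARE == 2 and DEF == 1 (the values used
   for the -momit-leaf-frame-pointer check below): caller 0 / callee 2
   allows inlining (the callee doesn't care), caller 0 / callee 1
   allows it (the callee uses the default), and caller 1 / callee 0
   rejects it.  */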
13867
13868 static bool
13869 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13870 int dont_care, int def)
13871 {
13872 /* If the callee doesn't care, always allow inlining. */
13873 if (callee == dont_care)
13874 return true;
13875
13876 /* If the caller doesn't care, always allow inlining. */
13877 if (caller == dont_care)
13878 return true;
13879
13880 /* Otherwise, allow inlining if either the callee and caller values
13881 agree, or if the callee is using the default value. */
13882 return (callee == caller || callee == def);
13883 }
13884
13885 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13886 to inline CALLEE into CALLER based on target-specific info.
13887 Make sure that the caller and callee have compatible architectural
13888 features. Then go through the other possible target attributes
13889 and see if they can block inlining. Try not to reject always_inline
13890 callees unless they are incompatible architecturally. */
13891
13892 static bool
13893 aarch64_can_inline_p (tree caller, tree callee)
13894 {
13895 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13896 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13897
13898 struct cl_target_option *caller_opts
13899 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13900 : target_option_default_node);
13901
13902 struct cl_target_option *callee_opts
13903 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13904 : target_option_default_node);
13905
13906 /* Callee's ISA flags should be a subset of the caller's. */
13907 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13908 != callee_opts->x_aarch64_isa_flags)
13909 return false;
13910
13911   /* Allow inlining of non-strict aligned functions into strict
13912      aligned ones.  */
13913 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13914 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13915 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13916 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13917 return false;
13918
13919 bool always_inline = lookup_attribute ("always_inline",
13920 DECL_ATTRIBUTES (callee));
13921
13922 /* If the architectural features match up and the callee is always_inline
13923 then the other attributes don't matter. */
13924 if (always_inline)
13925 return true;
13926
13927 if (caller_opts->x_aarch64_cmodel_var
13928 != callee_opts->x_aarch64_cmodel_var)
13929 return false;
13930
13931 if (caller_opts->x_aarch64_tls_dialect
13932 != callee_opts->x_aarch64_tls_dialect)
13933 return false;
13934
13935   /* Honour explicit requests to work around errata.  */
13936 if (!aarch64_tribools_ok_for_inlining_p (
13937 caller_opts->x_aarch64_fix_a53_err835769,
13938 callee_opts->x_aarch64_fix_a53_err835769,
13939 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13940 return false;
13941
13942 if (!aarch64_tribools_ok_for_inlining_p (
13943 caller_opts->x_aarch64_fix_a53_err843419,
13944 callee_opts->x_aarch64_fix_a53_err843419,
13945 2, TARGET_FIX_ERR_A53_843419))
13946 return false;
13947
13948 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13949      caller and callee and they don't match up, reject inlining.  */
13950 if (!aarch64_tribools_ok_for_inlining_p (
13951 caller_opts->x_flag_omit_leaf_frame_pointer,
13952 callee_opts->x_flag_omit_leaf_frame_pointer,
13953 2, 1))
13954 return false;
13955
13956 /* If the callee has specific tuning overrides, respect them. */
13957 if (callee_opts->x_aarch64_override_tune_string != NULL
13958 && caller_opts->x_aarch64_override_tune_string == NULL)
13959 return false;
13960
13961 /* If the user specified tuning override strings for the
13962 caller and callee and they don't match up, reject inlining.
13963 We just do a string compare here, we don't analyze the meaning
13964 of the string, as it would be too costly for little gain. */
13965 if (callee_opts->x_aarch64_override_tune_string
13966 && caller_opts->x_aarch64_override_tune_string
13967 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13968 caller_opts->x_aarch64_override_tune_string) != 0))
13969 return false;
13970
13971 return true;
13972 }
13973
13974 /* Return true if SYMBOL_REF X binds locally. */
13975
13976 static bool
13977 aarch64_symbol_binds_local_p (const_rtx x)
13978 {
13979 return (SYMBOL_REF_DECL (x)
13980 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13981 : SYMBOL_REF_LOCAL_P (x));
13982 }
13983
13984 /* Return true if SYMBOL_REF X is thread local.  */
13985 static bool
13986 aarch64_tls_symbol_p (rtx x)
13987 {
13988 if (! TARGET_HAVE_TLS)
13989 return false;
13990
13991 if (GET_CODE (x) != SYMBOL_REF)
13992 return false;
13993
13994 return SYMBOL_REF_TLS_MODEL (x) != 0;
13995 }
13996
13997 /* Classify a TLS symbol into one of the TLS kinds. */
13998 enum aarch64_symbol_type
13999 aarch64_classify_tls_symbol (rtx x)
14000 {
14001 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14002
14003 switch (tls_kind)
14004 {
14005 case TLS_MODEL_GLOBAL_DYNAMIC:
14006 case TLS_MODEL_LOCAL_DYNAMIC:
14007 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
14008
14009 case TLS_MODEL_INITIAL_EXEC:
14010 switch (aarch64_cmodel)
14011 {
14012 case AARCH64_CMODEL_TINY:
14013 case AARCH64_CMODEL_TINY_PIC:
14014 return SYMBOL_TINY_TLSIE;
14015 default:
14016 return SYMBOL_SMALL_TLSIE;
14017 }
14018
14019 case TLS_MODEL_LOCAL_EXEC:
14020 if (aarch64_tls_size == 12)
14021 return SYMBOL_TLSLE12;
14022 else if (aarch64_tls_size == 24)
14023 return SYMBOL_TLSLE24;
14024 else if (aarch64_tls_size == 32)
14025 return SYMBOL_TLSLE32;
14026 else if (aarch64_tls_size == 48)
14027 return SYMBOL_TLSLE48;
14028 else
14029 gcc_unreachable ();
14030
14031 case TLS_MODEL_EMULATED:
14032 case TLS_MODEL_NONE:
14033 return SYMBOL_FORCE_TO_MEM;
14034
14035 default:
14036 gcc_unreachable ();
14037 }
14038 }
14039
14040 /* Return the correct method for accessing X + OFFSET, where X is either
14041 a SYMBOL_REF or LABEL_REF. */
14042
14043 enum aarch64_symbol_type
14044 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
14045 {
14046 if (GET_CODE (x) == LABEL_REF)
14047 {
14048 switch (aarch64_cmodel)
14049 {
14050 case AARCH64_CMODEL_LARGE:
14051 return SYMBOL_FORCE_TO_MEM;
14052
14053 case AARCH64_CMODEL_TINY_PIC:
14054 case AARCH64_CMODEL_TINY:
14055 return SYMBOL_TINY_ABSOLUTE;
14056
14057 case AARCH64_CMODEL_SMALL_SPIC:
14058 case AARCH64_CMODEL_SMALL_PIC:
14059 case AARCH64_CMODEL_SMALL:
14060 return SYMBOL_SMALL_ABSOLUTE;
14061
14062 default:
14063 gcc_unreachable ();
14064 }
14065 }
14066
14067 if (GET_CODE (x) == SYMBOL_REF)
14068 {
14069 if (aarch64_tls_symbol_p (x))
14070 return aarch64_classify_tls_symbol (x);
14071
14072 switch (aarch64_cmodel)
14073 {
14074 case AARCH64_CMODEL_TINY:
14075 /* When we retrieve symbol + offset address, we have to make sure
14076 the offset does not cause overflow of the final address. But
14077 we have no way of knowing the address of symbol at compile time
14078 so we can't accurately say if the distance between the PC and
14079 	     symbol + offset is outside the addressable range of +/-1M in the
14080 	     TINY code model.  So we rely on images not being greater than
14081 	     1M and cap the offset at 1M; anything beyond that will have to
14082 	     be loaded using an alternative mechanism.  Furthermore, if the
14083 symbol is a weak reference to something that isn't known to
14084 resolve to a symbol in this module, then force to memory. */
14085 if ((SYMBOL_REF_WEAK (x)
14086 && !aarch64_symbol_binds_local_p (x))
14087 || !IN_RANGE (offset, -1048575, 1048575))
14088 return SYMBOL_FORCE_TO_MEM;
14089 return SYMBOL_TINY_ABSOLUTE;
14090
14091 case AARCH64_CMODEL_SMALL:
14092 /* Same reasoning as the tiny code model, but the offset cap here is
14093 4G. */
14094 if ((SYMBOL_REF_WEAK (x)
14095 && !aarch64_symbol_binds_local_p (x))
14096 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
14097 HOST_WIDE_INT_C (4294967264)))
14098 return SYMBOL_FORCE_TO_MEM;
14099 return SYMBOL_SMALL_ABSOLUTE;
14100
14101 case AARCH64_CMODEL_TINY_PIC:
14102 if (!aarch64_symbol_binds_local_p (x))
14103 return SYMBOL_TINY_GOT;
14104 return SYMBOL_TINY_ABSOLUTE;
14105
14106 case AARCH64_CMODEL_SMALL_SPIC:
14107 case AARCH64_CMODEL_SMALL_PIC:
14108 if (!aarch64_symbol_binds_local_p (x))
14109 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14110 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14111 return SYMBOL_SMALL_ABSOLUTE;
14112
14113 case AARCH64_CMODEL_LARGE:
14114 /* This is alright even in PIC code as the constant
14115 pool reference is always PC relative and within
14116 the same translation unit. */
14117 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14118 return SYMBOL_SMALL_ABSOLUTE;
14119 else
14120 return SYMBOL_FORCE_TO_MEM;
14121
14122 default:
14123 gcc_unreachable ();
14124 }
14125 }
14126
14127 /* By default push everything into the constant pool. */
14128 return SYMBOL_FORCE_TO_MEM;
14129 }
14130
14131 bool
14132 aarch64_constant_address_p (rtx x)
14133 {
14134 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14135 }
14136
14137 bool
14138 aarch64_legitimate_pic_operand_p (rtx x)
14139 {
14140 if (GET_CODE (x) == SYMBOL_REF
14141 || (GET_CODE (x) == CONST
14142 && GET_CODE (XEXP (x, 0)) == PLUS
14143 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14144 return false;
14145
14146 return true;
14147 }
14148
14149 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14150 that should be rematerialized rather than spilled. */
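/* For instance (illustrative only, following the checks below): a
   CONST_INT such as 0x123456 or a non-TLS SYMBOL_REF is cheap to
   recompute and is accepted, whereas a TLS SYMBOL_REF needs a
   multi-instruction sequence and is rejected so that it is spilled
   instead:

     (const_int 0x123456)        -> true  (rematerialize)
     (symbol_ref "external_var") -> true  (non-TLS symbols are constants)
     (symbol_ref "tls_var")      -> false (spill rather than recompute)  */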
14151
14152 static bool
14153 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14154 {
14155 /* Support CSE and rematerialization of common constants. */
14156 if (CONST_INT_P (x)
14157 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14158 || GET_CODE (x) == CONST_VECTOR)
14159 return true;
14160
14161 /* Do not allow vector struct mode constants for Advanced SIMD.
14162 We could support 0 and -1 easily, but they need support in
14163 aarch64-simd.md. */
14164 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14165 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14166 return false;
14167
14168 /* Only accept variable-length vector constants if they can be
14169 handled directly.
14170
14171 ??? It would be possible to handle rematerialization of other
14172 constants via secondary reloads. */
14173 if (vec_flags & VEC_ANY_SVE)
14174 return aarch64_simd_valid_immediate (x, NULL);
14175
14176 if (GET_CODE (x) == HIGH)
14177 x = XEXP (x, 0);
14178
14179 /* Accept polynomial constants that can be calculated by using the
14180 destination of a move as the sole temporary. Constants that
14181 require a second temporary cannot be rematerialized (they can't be
14182 forced to memory and also aren't legitimate constants). */
14183 poly_int64 offset;
14184 if (poly_int_rtx_p (x, &offset))
14185 return aarch64_offset_temporaries (false, offset) <= 1;
14186
14187 /* If an offset is being added to something else, we need to allow the
14188 base to be moved into the destination register, meaning that there
14189 are no free temporaries for the offset. */
14190 x = strip_offset (x, &offset);
14191 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14192 return false;
14193
14194 /* Do not allow const (plus (anchor_symbol, const_int)). */
14195 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14196 return false;
14197
14198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14199 so spilling them is better than rematerialization. */
14200 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14201 return true;
14202
14203 /* Label references are always constant. */
14204 if (GET_CODE (x) == LABEL_REF)
14205 return true;
14206
14207 return false;
14208 }
14209
14210 rtx
14211 aarch64_load_tp (rtx target)
14212 {
14213 if (!target
14214 || GET_MODE (target) != Pmode
14215 || !register_operand (target, Pmode))
14216 target = gen_reg_rtx (Pmode);
14217
14218 /* Can return in any reg. */
14219 emit_insn (gen_aarch64_load_tp_hard (target));
14220 return target;
14221 }
14222
14223 /* On AAPCS systems, this is the "struct __va_list". */
14224 static GTY(()) tree va_list_type;
14225
14226 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14227 Return the type to use as __builtin_va_list.
14228
14229 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14230
14231 struct __va_list
14232 {
14233 void *__stack;
14234 void *__gr_top;
14235 void *__vr_top;
14236 int __gr_offs;
14237 int __vr_offs;
14238 }; */
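/* Illustrative sketch of how the fields are used by the expanders below:
   __stack points at the next anonymous argument passed on the stack,
   __gr_top and __vr_top point just past the general-register and
   FP/SIMD-register save areas, and __gr_offs/__vr_offs hold the
   (negative) byte offset from the corresponding *_top of the next unread
   register argument.  A non-negative offset means the remaining
   arguments of that class live on the stack.  */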
14239
14240 static tree
14241 aarch64_build_builtin_va_list (void)
14242 {
14243 tree va_list_name;
14244 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14245
14246 /* Create the type. */
14247 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14248 /* Give it the required name. */
14249 va_list_name = build_decl (BUILTINS_LOCATION,
14250 TYPE_DECL,
14251 get_identifier ("__va_list"),
14252 va_list_type);
14253 DECL_ARTIFICIAL (va_list_name) = 1;
14254 TYPE_NAME (va_list_type) = va_list_name;
14255 TYPE_STUB_DECL (va_list_type) = va_list_name;
14256
14257 /* Create the fields. */
14258 f_stack = build_decl (BUILTINS_LOCATION,
14259 FIELD_DECL, get_identifier ("__stack"),
14260 ptr_type_node);
14261 f_grtop = build_decl (BUILTINS_LOCATION,
14262 FIELD_DECL, get_identifier ("__gr_top"),
14263 ptr_type_node);
14264 f_vrtop = build_decl (BUILTINS_LOCATION,
14265 FIELD_DECL, get_identifier ("__vr_top"),
14266 ptr_type_node);
14267 f_groff = build_decl (BUILTINS_LOCATION,
14268 FIELD_DECL, get_identifier ("__gr_offs"),
14269 integer_type_node);
14270 f_vroff = build_decl (BUILTINS_LOCATION,
14271 FIELD_DECL, get_identifier ("__vr_offs"),
14272 integer_type_node);
14273
14274 /* Tell the tree-stdarg pass about our internal offset fields.
14275 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14276 purposes, to identify whether the code is updating va_list internal
14277 offset fields in an irregular way. */
14278 va_list_gpr_counter_field = f_groff;
14279 va_list_fpr_counter_field = f_vroff;
14280
14281 DECL_ARTIFICIAL (f_stack) = 1;
14282 DECL_ARTIFICIAL (f_grtop) = 1;
14283 DECL_ARTIFICIAL (f_vrtop) = 1;
14284 DECL_ARTIFICIAL (f_groff) = 1;
14285 DECL_ARTIFICIAL (f_vroff) = 1;
14286
14287 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14288 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14289 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14290 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14291 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14292
14293 TYPE_FIELDS (va_list_type) = f_stack;
14294 DECL_CHAIN (f_stack) = f_grtop;
14295 DECL_CHAIN (f_grtop) = f_vrtop;
14296 DECL_CHAIN (f_vrtop) = f_groff;
14297 DECL_CHAIN (f_groff) = f_vroff;
14298
14299 /* Compute its layout. */
14300 layout_type (va_list_type);
14301
14302 return va_list_type;
14303 }
14304
14305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
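/* A sketch of the initialization emitted below for a callee such as
   "void f (int n, ...)", where only w0 is used by named arguments
   (illustrative; INCOMING stands for virtual_incoming_args_rtx and the
   exact sizes depend on the analyses below):

     ap.__stack   = INCOMING;       // no named arguments on the stack
     ap.__gr_top  = INCOMING;       // GR save area ends here
     ap.__vr_top  = INCOMING - 64;  // ROUND_UP (56, 16) bytes below it
     ap.__gr_offs = -56;            // x1-x7 still to be read
     ap.__vr_offs = -128;           // q0-q7 still to be read  */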
14306 static void
14307 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14308 {
14309 const CUMULATIVE_ARGS *cum;
14310 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14311 tree stack, grtop, vrtop, groff, vroff;
14312 tree t;
14313 int gr_save_area_size = cfun->va_list_gpr_size;
14314 int vr_save_area_size = cfun->va_list_fpr_size;
14315 int vr_offset;
14316
14317 cum = &crtl->args.info;
14318 if (cfun->va_list_gpr_size)
14319 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14320 cfun->va_list_gpr_size);
14321 if (cfun->va_list_fpr_size)
14322 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14323 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14324
14325 if (!TARGET_FLOAT)
14326 {
14327 gcc_assert (cum->aapcs_nvrn == 0);
14328 vr_save_area_size = 0;
14329 }
14330
14331 f_stack = TYPE_FIELDS (va_list_type_node);
14332 f_grtop = DECL_CHAIN (f_stack);
14333 f_vrtop = DECL_CHAIN (f_grtop);
14334 f_groff = DECL_CHAIN (f_vrtop);
14335 f_vroff = DECL_CHAIN (f_groff);
14336
14337 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14338 NULL_TREE);
14339 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14340 NULL_TREE);
14341 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14342 NULL_TREE);
14343 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14344 NULL_TREE);
14345 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14346 NULL_TREE);
14347
14348 /* Emit code to initialize STACK, which points to the next varargs stack
14349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14350 by named arguments. STACK is 8-byte aligned. */
14351 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14352 if (cum->aapcs_stack_size > 0)
14353 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14354 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14355 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14356
14357 /* Emit code to initialize GRTOP, the top of the GR save area.
14358 virtual_incoming_args_rtx should have been 16-byte aligned. */
14359 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14360 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14361 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14362
14363 /* Emit code to initialize VRTOP, the top of the VR save area.
14364 This address is gr_save_area_bytes below GRTOP, rounded
14365 down to the next 16-byte boundary. */
14366 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14367 vr_offset = ROUND_UP (gr_save_area_size,
14368 STACK_BOUNDARY / BITS_PER_UNIT);
14369
14370 if (vr_offset)
14371 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14372 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14373 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14374
14375 /* Emit code to initialize GROFF, the offset from GRTOP of the
14376 next GPR argument. */
14377 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14378 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14379 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14380
14381 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14382 of the next VR argument. */
14383 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14384 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14385 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14386 }
14387
14388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
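/* The expansion built below has roughly this shape (illustrative
   pseudo-code for an argument passed in general registers; the FP/SIMD
   path is analogous but uses __vr_top/__vr_offs):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                     // registers already exhausted
     ap.__gr_offs = off + rsize;          // consume the register slot(s)
     if (ap.__gr_offs > 0)
       goto on_stack;                     // would straddle; use the stack
     addr = ap.__gr_top + off;            // read from the GR save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8; // advance past the argument
   done:
     ...                                  // plus alignment and big-endian
                                          // fix-ups  */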
14389
14390 static tree
14391 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14392 gimple_seq *post_p ATTRIBUTE_UNUSED)
14393 {
14394 tree addr;
14395 bool indirect_p;
14396 bool is_ha; /* is HFA or HVA. */
14397 bool dw_align; /* double-word align. */
14398 machine_mode ag_mode = VOIDmode;
14399 int nregs;
14400 machine_mode mode;
14401
14402 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14403 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14404 HOST_WIDE_INT size, rsize, adjust, align;
14405 tree t, u, cond1, cond2;
14406
14407 indirect_p = pass_va_arg_by_reference (type);
14408 if (indirect_p)
14409 type = build_pointer_type (type);
14410
14411 mode = TYPE_MODE (type);
14412
14413 f_stack = TYPE_FIELDS (va_list_type_node);
14414 f_grtop = DECL_CHAIN (f_stack);
14415 f_vrtop = DECL_CHAIN (f_grtop);
14416 f_groff = DECL_CHAIN (f_vrtop);
14417 f_vroff = DECL_CHAIN (f_groff);
14418
14419 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14420 f_stack, NULL_TREE);
14421 size = int_size_in_bytes (type);
14422
14423 bool abi_break;
14424 align
14425 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14426
14427 dw_align = false;
14428 adjust = 0;
14429 if (aarch64_vfp_is_call_or_return_candidate (mode,
14430 type,
14431 &ag_mode,
14432 &nregs,
14433 &is_ha))
14434 {
14435 /* No frontends can create types with variable-sized modes, so we
14436 shouldn't be asked to pass or return them. */
14437 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14438
14439 /* TYPE passed in fp/simd registers. */
14440 if (!TARGET_FLOAT)
14441 aarch64_err_no_fpadvsimd (mode);
14442
14443 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14444 unshare_expr (valist), f_vrtop, NULL_TREE);
14445 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14446 unshare_expr (valist), f_vroff, NULL_TREE);
14447
14448 rsize = nregs * UNITS_PER_VREG;
14449
14450 if (is_ha)
14451 {
14452 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14453 adjust = UNITS_PER_VREG - ag_size;
14454 }
14455 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14456 && size < UNITS_PER_VREG)
14457 {
14458 adjust = UNITS_PER_VREG - size;
14459 }
14460 }
14461 else
14462 {
14463 /* TYPE passed in general registers. */
14464 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14465 unshare_expr (valist), f_grtop, NULL_TREE);
14466 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14467 unshare_expr (valist), f_groff, NULL_TREE);
14468 rsize = ROUND_UP (size, UNITS_PER_WORD);
14469 nregs = rsize / UNITS_PER_WORD;
14470
14471 if (align > 8)
14472 {
14473 if (abi_break && warn_psabi)
14474 inform (input_location, "parameter passing for argument of type "
14475 "%qT changed in GCC 9.1", type);
14476 dw_align = true;
14477 }
14478
14479 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14480 && size < UNITS_PER_WORD)
14481 {
14482 adjust = UNITS_PER_WORD - size;
14483 }
14484 }
14485
14486 /* Get a local temporary for the field value. */
14487 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14488
14489 /* Emit code to branch if off >= 0. */
14490 t = build2 (GE_EXPR, boolean_type_node, off,
14491 build_int_cst (TREE_TYPE (off), 0));
14492 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14493
14494 if (dw_align)
14495 {
14496 /* Emit: offs = (offs + 15) & -16. */
14497 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14498 build_int_cst (TREE_TYPE (off), 15));
14499 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14500 build_int_cst (TREE_TYPE (off), -16));
14501 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14502 }
14503 else
14504 roundup = NULL;
14505
14506 /* Update ap.__[g|v]r_offs */
14507 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14508 build_int_cst (TREE_TYPE (off), rsize));
14509 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14510
14511 /* Chain up: apply the roundup (if any) before the update. */
14512 if (roundup)
14513 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14514
14515 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14516 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14517 build_int_cst (TREE_TYPE (f_off), 0));
14518 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14519
14520 /* String up: make sure the assignment happens before the use. */
14521 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14522 COND_EXPR_ELSE (cond1) = t;
14523
14524 /* Prepare the trees handling the argument that is passed on the stack;
14525 the top-level node is stored in ON_STACK. */
14526 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14527 if (align > 8)
14528 {
14529 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14530 t = fold_build_pointer_plus_hwi (arg, 15);
14531 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14532 build_int_cst (TREE_TYPE (t), -16));
14533 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14534 }
14535 else
14536 roundup = NULL;
14537 /* Advance ap.__stack */
14538 t = fold_build_pointer_plus_hwi (arg, size + 7);
14539 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14540 build_int_cst (TREE_TYPE (t), -8));
14541 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14542 /* String up roundup and advance. */
14543 if (roundup)
14544 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14545 /* Chain up with ARG so the result is the argument address. */
14546 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14547 /* Big-endianness related address adjustment. */
14548 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14549 && size < UNITS_PER_WORD)
14550 {
14551 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14552 size_int (UNITS_PER_WORD - size));
14553 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14554 }
14555
14556 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14557 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14558
14559 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14560 t = off;
14561 if (adjust)
14562 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14563 build_int_cst (TREE_TYPE (off), adjust));
14564
14565 t = fold_convert (sizetype, t);
14566 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14567
14568 if (is_ha)
14569 {
14570 /* type ha; // treat as "struct {ftype field[n];}"
14571 ... [computing offs]
14572 for (i = 0; i < nregs; ++i, offs += 16)
14573 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14574 return ha; */
14575 int i;
14576 tree tmp_ha, field_t, field_ptr_t;
14577
14578 /* Declare a local variable. */
14579 tmp_ha = create_tmp_var_raw (type, "ha");
14580 gimple_add_tmp_var (tmp_ha);
14581
14582 /* Establish the base type. */
14583 switch (ag_mode)
14584 {
14585 case E_SFmode:
14586 field_t = float_type_node;
14587 field_ptr_t = float_ptr_type_node;
14588 break;
14589 case E_DFmode:
14590 field_t = double_type_node;
14591 field_ptr_t = double_ptr_type_node;
14592 break;
14593 case E_TFmode:
14594 field_t = long_double_type_node;
14595 field_ptr_t = long_double_ptr_type_node;
14596 break;
14597 case E_HFmode:
14598 field_t = aarch64_fp16_type_node;
14599 field_ptr_t = aarch64_fp16_ptr_type_node;
14600 break;
14601 case E_V2SImode:
14602 case E_V4SImode:
14603 {
14604 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14605 field_t = build_vector_type_for_mode (innertype, ag_mode);
14606 field_ptr_t = build_pointer_type (field_t);
14607 }
14608 break;
14609 default:
14610 gcc_assert (0);
14611 }
14612
14613 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
14614 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14615 addr = t;
14616 t = fold_convert (field_ptr_t, addr);
14617 t = build2 (MODIFY_EXPR, field_t,
14618 build1 (INDIRECT_REF, field_t, tmp_ha),
14619 build1 (INDIRECT_REF, field_t, t));
14620
14621 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14622 for (i = 1; i < nregs; ++i)
14623 {
14624 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14625 u = fold_convert (field_ptr_t, addr);
14626 u = build2 (MODIFY_EXPR, field_t,
14627 build2 (MEM_REF, field_t, tmp_ha,
14628 build_int_cst (field_ptr_t,
14629 (i *
14630 int_size_in_bytes (field_t)))),
14631 build1 (INDIRECT_REF, field_t, u));
14632 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14633 }
14634
14635 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14636 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14637 }
14638
14639 COND_EXPR_ELSE (cond2) = t;
14640 addr = fold_convert (build_pointer_type (type), cond1);
14641 addr = build_va_arg_indirect_ref (addr);
14642
14643 if (indirect_p)
14644 addr = build_va_arg_indirect_ref (addr);
14645
14646 return addr;
14647 }
14648
14649 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14650
14651 static void
14652 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14653 const function_arg_info &arg,
14654 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14655 {
14656 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14657 CUMULATIVE_ARGS local_cum;
14658 int gr_saved = cfun->va_list_gpr_size;
14659 int vr_saved = cfun->va_list_fpr_size;
14660
14661 /* The caller has advanced CUM up to, but not beyond, the last named
14662 argument. Advance a local copy of CUM past the last "real" named
14663 argument, to find out how many registers are left over. */
14664 local_cum = *cum;
14665 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14666
14667 /* Find out how many registers we need to save.
14668 Honor the tree-stdarg analysis results. */
14669 if (cfun->va_list_gpr_size)
14670 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14671 cfun->va_list_gpr_size / UNITS_PER_WORD);
14672 if (cfun->va_list_fpr_size)
14673 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14674 cfun->va_list_fpr_size / UNITS_PER_VREG);
14675
14676 if (!TARGET_FLOAT)
14677 {
14678 gcc_assert (local_cum.aapcs_nvrn == 0);
14679 vr_saved = 0;
14680 }
14681
14682 if (!no_rtl)
14683 {
14684 if (gr_saved > 0)
14685 {
14686 rtx ptr, mem;
14687
14688 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14689 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14690 - gr_saved * UNITS_PER_WORD);
14691 mem = gen_frame_mem (BLKmode, ptr);
14692 set_mem_alias_set (mem, get_varargs_alias_set ());
14693
14694 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14695 mem, gr_saved);
14696 }
14697 if (vr_saved > 0)
14698 {
14699 /* We can't use move_block_from_reg, because it will use
14700 the wrong mode, storing D regs only. */
14701 machine_mode mode = TImode;
14702 int off, i, vr_start;
14703
14704 /* Set OFF to the offset from virtual_incoming_args_rtx of
14705 the first vector register. The VR save area lies below
14706 the GR one, and is aligned to 16 bytes. */
14707 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14708 STACK_BOUNDARY / BITS_PER_UNIT);
14709 off -= vr_saved * UNITS_PER_VREG;
14710
14711 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14712 for (i = 0; i < vr_saved; ++i)
14713 {
14714 rtx ptr, mem;
14715
14716 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14717 mem = gen_frame_mem (mode, ptr);
14718 set_mem_alias_set (mem, get_varargs_alias_set ());
14719 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14720 off += UNITS_PER_VREG;
14721 }
14722 }
14723 }
14724
14725 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14726 any complication of having crtl->args.pretend_args_size changed. */
14727 cfun->machine->frame.saved_varargs_size
14728 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14729 STACK_BOUNDARY / BITS_PER_UNIT)
14730 + vr_saved * UNITS_PER_VREG);
14731 }
14732
14733 static void
14734 aarch64_conditional_register_usage (void)
14735 {
14736 int i;
14737 if (!TARGET_FLOAT)
14738 {
14739 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14740 {
14741 fixed_regs[i] = 1;
14742 call_used_regs[i] = 1;
14743 }
14744 }
14745 if (!TARGET_SVE)
14746 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14747 {
14748 fixed_regs[i] = 1;
14749 call_used_regs[i] = 1;
14750 }
14751
14752 /* When tracking speculation, we need a couple of call-clobbered registers
14753 to track the speculation state. It would be nice to just use
14754 IP0 and IP1, but currently there are numerous places that just
14755 assume these registers are free for other uses (eg pointer
14756 authentication). */
14757 if (aarch64_track_speculation)
14758 {
14759 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14760 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14761 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14762 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14763 }
14764 }
14765
14766 /* Walk down the type tree of TYPE counting consecutive base elements.
14767 If *MODEP is VOIDmode, then set it to the first valid floating point
14768 type. If a non-floating point type is found, or if a floating point
14769 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14770 otherwise return the count in the sub-tree. */
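/* For example (illustrative):

     struct { float x, y, z; }      -> 3, *MODEP == SFmode (an HFA)
     struct { double d[2]; }        -> 2, *MODEP == DFmode
     struct { float f; double d; }  -> -1 (mixed base types)
     struct { float f; int i; }     -> -1 (non-floating-point member)  */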
14771 static int
14772 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14773 {
14774 machine_mode mode;
14775 HOST_WIDE_INT size;
14776
14777 switch (TREE_CODE (type))
14778 {
14779 case REAL_TYPE:
14780 mode = TYPE_MODE (type);
14781 if (mode != DFmode && mode != SFmode
14782 && mode != TFmode && mode != HFmode)
14783 return -1;
14784
14785 if (*modep == VOIDmode)
14786 *modep = mode;
14787
14788 if (*modep == mode)
14789 return 1;
14790
14791 break;
14792
14793 case COMPLEX_TYPE:
14794 mode = TYPE_MODE (TREE_TYPE (type));
14795 if (mode != DFmode && mode != SFmode
14796 && mode != TFmode && mode != HFmode)
14797 return -1;
14798
14799 if (*modep == VOIDmode)
14800 *modep = mode;
14801
14802 if (*modep == mode)
14803 return 2;
14804
14805 break;
14806
14807 case VECTOR_TYPE:
14808 /* Use V2SImode and V4SImode as representatives of all 64-bit
14809 and 128-bit vector types. */
14810 size = int_size_in_bytes (type);
14811 switch (size)
14812 {
14813 case 8:
14814 mode = V2SImode;
14815 break;
14816 case 16:
14817 mode = V4SImode;
14818 break;
14819 default:
14820 return -1;
14821 }
14822
14823 if (*modep == VOIDmode)
14824 *modep = mode;
14825
14826 /* Vector modes are considered to be opaque: two vectors are
14827 equivalent for the purposes of being homogeneous aggregates
14828 if they are the same size. */
14829 if (*modep == mode)
14830 return 1;
14831
14832 break;
14833
14834 case ARRAY_TYPE:
14835 {
14836 int count;
14837 tree index = TYPE_DOMAIN (type);
14838
14839 /* Can't handle incomplete types nor sizes that are not
14840 fixed. */
14841 if (!COMPLETE_TYPE_P (type)
14842 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14843 return -1;
14844
14845 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14846 if (count == -1
14847 || !index
14848 || !TYPE_MAX_VALUE (index)
14849 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14850 || !TYPE_MIN_VALUE (index)
14851 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14852 || count < 0)
14853 return -1;
14854
14855 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14856 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14857
14858 /* There must be no padding. */
14859 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14860 count * GET_MODE_BITSIZE (*modep)))
14861 return -1;
14862
14863 return count;
14864 }
14865
14866 case RECORD_TYPE:
14867 {
14868 int count = 0;
14869 int sub_count;
14870 tree field;
14871
14872 /* Can't handle incomplete types nor sizes that are not
14873 fixed. */
14874 if (!COMPLETE_TYPE_P (type)
14875 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14876 return -1;
14877
14878 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14879 {
14880 if (TREE_CODE (field) != FIELD_DECL)
14881 continue;
14882
14883 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14884 if (sub_count < 0)
14885 return -1;
14886 count += sub_count;
14887 }
14888
14889 /* There must be no padding. */
14890 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14891 count * GET_MODE_BITSIZE (*modep)))
14892 return -1;
14893
14894 return count;
14895 }
14896
14897 case UNION_TYPE:
14898 case QUAL_UNION_TYPE:
14899 {
14900 /* These aren't very interesting except in a degenerate case. */
14901 int count = 0;
14902 int sub_count;
14903 tree field;
14904
14905 /* Can't handle incomplete types nor sizes that are not
14906 fixed. */
14907 if (!COMPLETE_TYPE_P (type)
14908 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14909 return -1;
14910
14911 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14912 {
14913 if (TREE_CODE (field) != FIELD_DECL)
14914 continue;
14915
14916 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14917 if (sub_count < 0)
14918 return -1;
14919 count = count > sub_count ? count : sub_count;
14920 }
14921
14922 /* There must be no padding. */
14923 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14924 count * GET_MODE_BITSIZE (*modep)))
14925 return -1;
14926
14927 return count;
14928 }
14929
14930 default:
14931 break;
14932 }
14933
14934 return -1;
14935 }
14936
14937 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14938 type as described in AAPCS64 \S 4.1.2.
14939
14940 See the comment above aarch64_composite_type_p for the notes on MODE. */
14941
14942 static bool
14943 aarch64_short_vector_p (const_tree type,
14944 machine_mode mode)
14945 {
14946 poly_int64 size = -1;
14947
14948 if (type && TREE_CODE (type) == VECTOR_TYPE)
14949 size = int_size_in_bytes (type);
14950 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14951 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14952 size = GET_MODE_SIZE (mode);
14953
14954 return known_eq (size, 8) || known_eq (size, 16);
14955 }
14956
14957 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14958 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14959 array types. The C99 floating-point complex types are also considered
14960 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14961 types, which are GCC extensions and out of the scope of AAPCS64, are
14962 treated as composite types here as well.
14963
14964 Note that MODE itself is not sufficient in determining whether a type
14965 is such a composite type or not. This is because
14966 stor-layout.c:compute_record_mode may have already changed the MODE
14967 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14968 structure with only one field may have its MODE set to the mode of the
14969 field. Also, an integer mode whose size matches the size of the
14970 RECORD_TYPE type may have been substituted for the original mode
14971 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14972 solely relied on. */
14973
14974 static bool
14975 aarch64_composite_type_p (const_tree type,
14976 machine_mode mode)
14977 {
14978 if (aarch64_short_vector_p (type, mode))
14979 return false;
14980
14981 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14982 return true;
14983
14984 if (mode == BLKmode
14985 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14986 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14987 return true;
14988
14989 return false;
14990 }
14991
14992 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14993 shall be passed or returned in simd/fp register(s) (providing these
14994 parameter passing registers are available).
14995
14996 Upon successful return, *COUNT returns the number of needed registers,
14997 *BASE_MODE returns the mode of the individual register and, when IS_HA
14998 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14999 floating-point aggregate or a homogeneous short-vector aggregate. */
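/* For example (illustrative): a _Complex double argument yields
   *COUNT == 2, *BASE_MODE == DFmode and *IS_HA == true; a
   struct { float f[4]; } yields *COUNT == 4, *BASE_MODE == SFmode and
   *IS_HA == true; a plain double yields *COUNT == 1, *BASE_MODE == DFmode
   and *IS_HA == false.  */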
15000
15001 static bool
15002 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
15003 const_tree type,
15004 machine_mode *base_mode,
15005 int *count,
15006 bool *is_ha)
15007 {
15008 machine_mode new_mode = VOIDmode;
15009 bool composite_p = aarch64_composite_type_p (type, mode);
15010
15011 if (is_ha != NULL) *is_ha = false;
15012
15013 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
15014 || aarch64_short_vector_p (type, mode))
15015 {
15016 *count = 1;
15017 new_mode = mode;
15018 }
15019 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
15020 {
15021 if (is_ha != NULL) *is_ha = true;
15022 *count = 2;
15023 new_mode = GET_MODE_INNER (mode);
15024 }
15025 else if (type && composite_p)
15026 {
15027 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
15028
15029 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
15030 {
15031 if (is_ha != NULL) *is_ha = true;
15032 *count = ag_count;
15033 }
15034 else
15035 return false;
15036 }
15037 else
15038 return false;
15039
15040 *base_mode = new_mode;
15041 return true;
15042 }
15043
15044 /* Implement TARGET_STRUCT_VALUE_RTX. */
15045
15046 static rtx
15047 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
15048 int incoming ATTRIBUTE_UNUSED)
15049 {
15050 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
15051 }
15052
15053 /* Implements target hook vector_mode_supported_p. */
15054 static bool
15055 aarch64_vector_mode_supported_p (machine_mode mode)
15056 {
15057 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15058 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
15059 }
15060
15061 /* Return the full-width SVE vector mode for element mode MODE, if one
15062 exists. */
15063 opt_machine_mode
15064 aarch64_full_sve_mode (scalar_mode mode)
15065 {
15066 switch (mode)
15067 {
15068 case E_DFmode:
15069 return VNx2DFmode;
15070 case E_SFmode:
15071 return VNx4SFmode;
15072 case E_HFmode:
15073 return VNx8HFmode;
15074 case E_DImode:
15075 return VNx2DImode;
15076 case E_SImode:
15077 return VNx4SImode;
15078 case E_HImode:
15079 return VNx8HImode;
15080 case E_QImode:
15081 return VNx16QImode;
15082 default:
15083 return opt_machine_mode ();
15084 }
15085 }
15086
15087 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15088 if it exists. */
15089 opt_machine_mode
15090 aarch64_vq_mode (scalar_mode mode)
15091 {
15092 switch (mode)
15093 {
15094 case E_DFmode:
15095 return V2DFmode;
15096 case E_SFmode:
15097 return V4SFmode;
15098 case E_HFmode:
15099 return V8HFmode;
15100 case E_SImode:
15101 return V4SImode;
15102 case E_HImode:
15103 return V8HImode;
15104 case E_QImode:
15105 return V16QImode;
15106 case E_DImode:
15107 return V2DImode;
15108 default:
15109 return opt_machine_mode ();
15110 }
15111 }
15112
15113 /* Return appropriate SIMD container
15114 for MODE within a vector of WIDTH bits. */
15115 static machine_mode
15116 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15117 {
15118 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15119 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15120
15121 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15122 if (TARGET_SIMD)
15123 {
15124 if (known_eq (width, 128))
15125 return aarch64_vq_mode (mode).else_mode (word_mode);
15126 else
15127 switch (mode)
15128 {
15129 case E_SFmode:
15130 return V2SFmode;
15131 case E_HFmode:
15132 return V4HFmode;
15133 case E_SImode:
15134 return V2SImode;
15135 case E_HImode:
15136 return V4HImode;
15137 case E_QImode:
15138 return V8QImode;
15139 default:
15140 break;
15141 }
15142 }
15143 return word_mode;
15144 }
15145
15146 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15147 static machine_mode
15148 aarch64_preferred_simd_mode (scalar_mode mode)
15149 {
15150 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15151 return aarch64_simd_container_mode (mode, bits);
15152 }
15153
15154 /* Return a list of possible vector sizes for the vectorizer
15155 to iterate over. */
15156 static void
15157 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15158 {
15159 if (TARGET_SVE)
15160 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15161 sizes->safe_push (16);
15162 sizes->safe_push (8);
15163 }
15164
15165 /* Implement TARGET_MANGLE_TYPE. */
15166
15167 static const char *
15168 aarch64_mangle_type (const_tree type)
15169 {
15170 /* The AArch64 ABI documents say that "__va_list" has to be
15171 mangled as if it is in the "std" namespace. */
15172 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15173 return "St9__va_list";
15174
15175 /* Half-precision float. */
15176 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15177 return "Dh";
15178
15179 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15180 builtin types. */
15181 if (TYPE_NAME (type) != NULL)
15182 return aarch64_general_mangle_builtin_type (type);
15183
15184 /* Use the default mangling. */
15185 return NULL;
15186 }
15187
15188 /* Find the first rtx_insn before insn that will generate an assembly
15189 instruction. */
15190
15191 static rtx_insn *
15192 aarch64_prev_real_insn (rtx_insn *insn)
15193 {
15194 if (!insn)
15195 return NULL;
15196
15197 do
15198 {
15199 insn = prev_real_insn (insn);
15200 }
15201 while (insn && recog_memoized (insn) < 0);
15202
15203 return insn;
15204 }
15205
15206 static bool
15207 is_madd_op (enum attr_type t1)
15208 {
15209 unsigned int i;
15210 /* A number of these may be AArch32 only. */
15211 enum attr_type mlatypes[] = {
15212 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15213 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15214 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15215 };
15216
15217 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15218 {
15219 if (t1 == mlatypes[i])
15220 return true;
15221 }
15222
15223 return false;
15224 }
15225
15226 /* Check if there is a register dependency between a load and the insn
15227 for which we hold recog_data. */
15228
15229 static bool
15230 dep_between_memop_and_curr (rtx memop)
15231 {
15232 rtx load_reg;
15233 int opno;
15234
15235 gcc_assert (GET_CODE (memop) == SET);
15236
15237 if (!REG_P (SET_DEST (memop)))
15238 return false;
15239
15240 load_reg = SET_DEST (memop);
15241 for (opno = 1; opno < recog_data.n_operands; opno++)
15242 {
15243 rtx operand = recog_data.operand[opno];
15244 if (REG_P (operand)
15245 && reg_overlap_mentioned_p (load_reg, operand))
15246 return true;
15247
15248 }
15249 return false;
15250 }
15251
15252
15253 /* When working around the Cortex-A53 erratum 835769,
15254 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15255 instruction and has a preceding memory instruction such that a NOP
15256 should be inserted between them. */
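/* With -mfix-cortex-a53-835769 the effect, via FINAL_PRESCAN_INSN below,
   is to emit e.g. (illustrative assembly):

     ldr  x1, [x2]
     nop    // between mem op and mult-accumulate
     madd x0, x3, x4, x5  */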
15257
15258 bool
15259 aarch64_madd_needs_nop (rtx_insn* insn)
15260 {
15261 enum attr_type attr_type;
15262 rtx_insn *prev;
15263 rtx body;
15264
15265 if (!TARGET_FIX_ERR_A53_835769)
15266 return false;
15267
15268 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15269 return false;
15270
15271 attr_type = get_attr_type (insn);
15272 if (!is_madd_op (attr_type))
15273 return false;
15274
15275 prev = aarch64_prev_real_insn (insn);
15276 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15277 Restore recog state to INSN to avoid state corruption. */
15278 extract_constrain_insn_cached (insn);
15279
15280 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15281 return false;
15282
15283 body = single_set (prev);
15284
15285 /* If the previous insn is a memory op and there is no dependency between
15286 it and the DImode madd, emit a NOP between them. If body is NULL then we
15287 have a complex memory operation, probably a load/store pair.
15288 Be conservative for now and emit a NOP. */
15289 if (GET_MODE (recog_data.operand[0]) == DImode
15290 && (!body || !dep_between_memop_and_curr (body)))
15291 return true;
15292
15293 return false;
15294
15295 }
15296
15297
15298 /* Implement FINAL_PRESCAN_INSN. */
15299
15300 void
15301 aarch64_final_prescan_insn (rtx_insn *insn)
15302 {
15303 if (aarch64_madd_needs_nop (insn))
15304 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15305 }
15306
15307
15308 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15309 instruction. */
15310
15311 bool
15312 aarch64_sve_index_immediate_p (rtx base_or_step)
15313 {
15314 return (CONST_INT_P (base_or_step)
15315 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15316 }
15317
15318 /* Return true if X is a valid immediate for the SVE ADD and SUB
15319 instructions. Negate X first if NEGATE_P is true. */
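/* For example (illustrative): a duplicated element value of 200 or of
   0x1100 is accepted (either 0..255, or a multiple of 256 up to 0xff00),
   whereas 257 (0x101) is rejected because it needs both bytes.  */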
15320
15321 bool
15322 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15323 {
15324 rtx elt;
15325
15326 if (!const_vec_duplicate_p (x, &elt)
15327 || !CONST_INT_P (elt))
15328 return false;
15329
15330 HOST_WIDE_INT val = INTVAL (elt);
15331 if (negate_p)
15332 val = -val;
15333 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15334
15335 if (val & 0xff)
15336 return IN_RANGE (val, 0, 0xff);
15337 return IN_RANGE (val, 0, 0xff00);
15338 }
15339
15340 /* Return true if X is a valid immediate operand for an SVE logical
15341 instruction such as AND. */
15342
15343 bool
15344 aarch64_sve_bitmask_immediate_p (rtx x)
15345 {
15346 rtx elt;
15347
15348 return (const_vec_duplicate_p (x, &elt)
15349 && CONST_INT_P (elt)
15350 && aarch64_bitmask_imm (INTVAL (elt),
15351 GET_MODE_INNER (GET_MODE (x))));
15352 }
15353
15354 /* Return true if X is a valid immediate for the SVE DUP and CPY
15355 instructions. */
15356
15357 bool
15358 aarch64_sve_dup_immediate_p (rtx x)
15359 {
15360 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15361 if (!CONST_INT_P (x))
15362 return false;
15363
15364 HOST_WIDE_INT val = INTVAL (x);
15365 if (val & 0xff)
15366 return IN_RANGE (val, -0x80, 0x7f);
15367 return IN_RANGE (val, -0x8000, 0x7f00);
15368 }
15369
15370 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15371 SIGNED_P says whether the operand is signed rather than unsigned. */
15372
15373 bool
15374 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15375 {
15376 rtx elt;
15377
15378 return (const_vec_duplicate_p (x, &elt)
15379 && CONST_INT_P (elt)
15380 && (signed_p
15381 ? IN_RANGE (INTVAL (elt), -16, 15)
15382 : IN_RANGE (INTVAL (elt), 0, 127)));
15383 }
15384
15385 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15386 instruction. Negate X first if NEGATE_P is true. */
15387
15388 bool
15389 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15390 {
15391 rtx elt;
15392 REAL_VALUE_TYPE r;
15393
15394 if (!const_vec_duplicate_p (x, &elt)
15395 || GET_CODE (elt) != CONST_DOUBLE)
15396 return false;
15397
15398 r = *CONST_DOUBLE_REAL_VALUE (elt);
15399
15400 if (negate_p)
15401 r = real_value_negate (&r);
15402
15403 if (real_equal (&r, &dconst1))
15404 return true;
15405 if (real_equal (&r, &dconsthalf))
15406 return true;
15407 return false;
15408 }
15409
15410 /* Return true if X is a valid immediate operand for an SVE FMUL
15411 instruction. */
15412
15413 bool
15414 aarch64_sve_float_mul_immediate_p (rtx x)
15415 {
15416 rtx elt;
15417
15418 return (const_vec_duplicate_p (x, &elt)
15419 && GET_CODE (elt) == CONST_DOUBLE
15420 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15421 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15422 }
15423
15424 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15425 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15426 is nonnull, use it to describe valid immediates. */
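/* For example (illustrative): VAL32 == 0x00ab0000 is matched by the first
   loop as (0xab, LSL #16) in SImode; VAL32 == 0x004f004f is matched by
   the second loop as (0x4f, LSL #0) in HImode; and, for
   AARCH64_CHECK_MOV, VAL32 == 0x0000abff is matched by the MSL loop as
   (0xab, MSL #8).  */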
15427 static bool
15428 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15429 simd_immediate_info *info,
15430 enum simd_immediate_check which,
15431 simd_immediate_info::insn_type insn)
15432 {
15433 /* Try a 4-byte immediate with LSL. */
15434 for (unsigned int shift = 0; shift < 32; shift += 8)
15435 if ((val32 & (0xff << shift)) == val32)
15436 {
15437 if (info)
15438 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15439 simd_immediate_info::LSL, shift);
15440 return true;
15441 }
15442
15443 /* Try a 2-byte immediate with LSL. */
15444 unsigned int imm16 = val32 & 0xffff;
15445 if (imm16 == (val32 >> 16))
15446 for (unsigned int shift = 0; shift < 16; shift += 8)
15447 if ((imm16 & (0xff << shift)) == imm16)
15448 {
15449 if (info)
15450 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15451 simd_immediate_info::LSL, shift);
15452 return true;
15453 }
15454
15455 /* Try a 4-byte immediate with MSL, except for cases that MVN
15456 can handle. */
15457 if (which == AARCH64_CHECK_MOV)
15458 for (unsigned int shift = 8; shift < 24; shift += 8)
15459 {
15460 unsigned int low = (1 << shift) - 1;
15461 if (((val32 & (0xff << shift)) | low) == val32)
15462 {
15463 if (info)
15464 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15465 simd_immediate_info::MSL, shift);
15466 return true;
15467 }
15468 }
15469
15470 return false;
15471 }
15472
15473 /* Return true if replicating VAL64 is a valid immediate for the
15474 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15475 use it to describe valid immediates. */
15476 static bool
15477 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15478 simd_immediate_info *info,
15479 enum simd_immediate_check which)
15480 {
15481 unsigned int val32 = val64 & 0xffffffff;
15482 unsigned int val16 = val64 & 0xffff;
15483 unsigned int val8 = val64 & 0xff;
15484
15485 if (val32 == (val64 >> 32))
15486 {
15487 if ((which & AARCH64_CHECK_ORR) != 0
15488 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15489 simd_immediate_info::MOV))
15490 return true;
15491
15492 if ((which & AARCH64_CHECK_BIC) != 0
15493 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15494 simd_immediate_info::MVN))
15495 return true;
15496
15497 /* Try using a replicated byte. */
15498 if (which == AARCH64_CHECK_MOV
15499 && val16 == (val32 >> 16)
15500 && val8 == (val16 >> 8))
15501 {
15502 if (info)
15503 *info = simd_immediate_info (QImode, val8);
15504 return true;
15505 }
15506 }
15507
15508 /* Try using a bit-to-bytemask. */
15509 if (which == AARCH64_CHECK_MOV)
15510 {
15511 unsigned int i;
15512 for (i = 0; i < 64; i += 8)
15513 {
15514 unsigned char byte = (val64 >> i) & 0xff;
15515 if (byte != 0 && byte != 0xff)
15516 break;
15517 }
15518 if (i == 64)
15519 {
15520 if (info)
15521 *info = simd_immediate_info (DImode, val64);
15522 return true;
15523 }
15524 }
15525 return false;
15526 }
15527
15528 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15529 instruction. If INFO is nonnull, use it to describe valid immediates. */
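/* For example (illustrative): VAL64 == 0x0101010101010101 narrows to
   QImode and is a DUP #1; VAL64 == 0xff00ff00ff00ff00 narrows to HImode
   and is a DUP with LSL #8 (#0xff, LSL #8); VAL64 == 0x00ff00ff00ff00ff
   is not in DUP range but is accepted as a DUPM bitmask immediate.  */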
15530
15531 static bool
15532 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15533 simd_immediate_info *info)
15534 {
15535 scalar_int_mode mode = DImode;
15536 unsigned int val32 = val64 & 0xffffffff;
15537 if (val32 == (val64 >> 32))
15538 {
15539 mode = SImode;
15540 unsigned int val16 = val32 & 0xffff;
15541 if (val16 == (val32 >> 16))
15542 {
15543 mode = HImode;
15544 unsigned int val8 = val16 & 0xff;
15545 if (val8 == (val16 >> 8))
15546 mode = QImode;
15547 }
15548 }
15549 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15550 if (IN_RANGE (val, -0x80, 0x7f))
15551 {
15552 /* DUP with no shift. */
15553 if (info)
15554 *info = simd_immediate_info (mode, val);
15555 return true;
15556 }
15557 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15558 {
15559 /* DUP with LSL #8. */
15560 if (info)
15561 *info = simd_immediate_info (mode, val);
15562 return true;
15563 }
15564 if (aarch64_bitmask_imm (val64, mode))
15565 {
15566 /* DUPM. */
15567 if (info)
15568 *info = simd_immediate_info (mode, val);
15569 return true;
15570 }
15571 return false;
15572 }
15573
15574 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15575 it to describe valid immediates. */
15576
15577 static bool
15578 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15579 {
15580 if (x == CONST0_RTX (GET_MODE (x)))
15581 {
15582 if (info)
15583 *info = simd_immediate_info (DImode, 0);
15584 return true;
15585 }
15586
15587 /* Analyze the value as a VNx16BImode. This should be relatively
15588 efficient, since rtx_vector_builder has enough built-in capacity
15589 to store all VLA predicate constants without needing the heap. */
15590 rtx_vector_builder builder;
15591 if (!aarch64_get_sve_pred_bits (builder, x))
15592 return false;
15593
15594 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15595 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15596 {
15597 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15598 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15599 if (pattern != AARCH64_NUM_SVPATTERNS)
15600 {
15601 if (info)
15602 {
15603 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15604 *info = simd_immediate_info (int_mode, pattern);
15605 }
15606 return true;
15607 }
15608 }
15609 return false;
15610 }
15611
15612 /* Return true if OP is a valid SIMD immediate for the operation
15613 described by WHICH. If INFO is nonnull, use it to describe valid
15614 immediates. */
15615 bool
15616 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15617 enum simd_immediate_check which)
15618 {
15619 machine_mode mode = GET_MODE (op);
15620 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15621 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15622 return false;
15623
15624 if (vec_flags & VEC_SVE_PRED)
15625 return aarch64_sve_pred_valid_immediate (op, info);
15626
15627 scalar_mode elt_mode = GET_MODE_INNER (mode);
15628 rtx base, step;
15629 unsigned int n_elts;
15630 if (GET_CODE (op) == CONST_VECTOR
15631 && CONST_VECTOR_DUPLICATE_P (op))
15632 n_elts = CONST_VECTOR_NPATTERNS (op);
15633 else if ((vec_flags & VEC_SVE_DATA)
15634 && const_vec_series_p (op, &base, &step))
15635 {
15636 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15637 if (!aarch64_sve_index_immediate_p (base)
15638 || !aarch64_sve_index_immediate_p (step))
15639 return false;
15640
15641 if (info)
15642 *info = simd_immediate_info (elt_mode, base, step);
15643 return true;
15644 }
15645 else if (GET_CODE (op) == CONST_VECTOR
15646 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15647 /* N_ELTS set above. */;
15648 else
15649 return false;
15650
15651 scalar_float_mode elt_float_mode;
15652 if (n_elts == 1
15653 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15654 {
15655 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15656 if (aarch64_float_const_zero_rtx_p (elt)
15657 || aarch64_float_const_representable_p (elt))
15658 {
15659 if (info)
15660 *info = simd_immediate_info (elt_float_mode, elt);
15661 return true;
15662 }
15663 }
15664
15665 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15666 if (elt_size > 8)
15667 return false;
15668
15669 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15670
15671 /* Expand the vector constant out into a byte vector, with the least
15672 significant byte of the register first. */
15673 auto_vec<unsigned char, 16> bytes;
15674 bytes.reserve (n_elts * elt_size);
15675 for (unsigned int i = 0; i < n_elts; i++)
15676 {
15677 /* The vector is provided in gcc endian-neutral fashion.
15678 For aarch64_be Advanced SIMD, it must be laid out in the vector
15679 register in reverse order. */
15680 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15681 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15682
15683 if (elt_mode != elt_int_mode)
15684 elt = gen_lowpart (elt_int_mode, elt);
15685
15686 if (!CONST_INT_P (elt))
15687 return false;
15688
15689 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15690 for (unsigned int byte = 0; byte < elt_size; byte++)
15691 {
15692 bytes.quick_push (elt_val & 0xff);
15693 elt_val >>= BITS_PER_UNIT;
15694 }
15695 }
15696
15697 /* The immediate must repeat every eight bytes. */
15698 unsigned int nbytes = bytes.length ();
15699 for (unsigned i = 8; i < nbytes; ++i)
15700 if (bytes[i] != bytes[i - 8])
15701 return false;
15702
15703 /* Get the repeating 8-byte value as an integer. No endian correction
15704 is needed here because bytes is already in lsb-first order. */
15705 unsigned HOST_WIDE_INT val64 = 0;
15706 for (unsigned int i = 0; i < 8; i++)
15707 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15708 << (i * BITS_PER_UNIT));
15709
15710 if (vec_flags & VEC_SVE_DATA)
15711 return aarch64_sve_valid_immediate (val64, info);
15712 else
15713 return aarch64_advsimd_valid_immediate (val64, info, which);
15714 }
15715
15716 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15717 has a step in the range of an SVE INDEX immediate. Return the step
15718 if so, otherwise return null. */
15719 rtx
15720 aarch64_check_zero_based_sve_index_immediate (rtx x)
15721 {
15722 rtx base, step;
15723 if (const_vec_series_p (x, &base, &step)
15724 && base == const0_rtx
15725 && aarch64_sve_index_immediate_p (step))
15726 return step;
15727 return NULL_RTX;
15728 }
15729
15730 /* Check whether immediate shift constants are within range. */
15731 bool
15732 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15733 {
15734 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15735 if (left)
15736 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15737 else
15738 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15739 }
15740
15741 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15742 operation of width WIDTH at bit position POS. */
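/* For example (illustrative): WIDTH == 8 and POS == 16 give the mask
   0x00ff0000.  */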
15743
15744 rtx
15745 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15746 {
15747 gcc_assert (CONST_INT_P (width));
15748 gcc_assert (CONST_INT_P (pos));
15749
15750 unsigned HOST_WIDE_INT mask
15751 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15752 return GEN_INT (mask << UINTVAL (pos));
15753 }
15754
15755 bool
15756 aarch64_mov_operand_p (rtx x, machine_mode mode)
15757 {
15758 if (GET_CODE (x) == HIGH
15759 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15760 return true;
15761
15762 if (CONST_INT_P (x))
15763 return true;
15764
15765 if (VECTOR_MODE_P (GET_MODE (x)))
15766 {
15767 /* Require predicate constants to be VNx16BI before RA, so that we
15768 force everything to have a canonical form. */
15769 if (!lra_in_progress
15770 && !reload_completed
15771 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15772 && GET_MODE (x) != VNx16BImode)
15773 return false;
15774
15775 return aarch64_simd_valid_immediate (x, NULL);
15776 }
15777
15778 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15779 return true;
15780
15781 if (aarch64_sve_cnt_immediate_p (x))
15782 return true;
15783
15784 return aarch64_classify_symbolic_expression (x)
15785 == SYMBOL_TINY_ABSOLUTE;
15786 }
15787
15788 /* Return a const_int vector of VAL. */
15789 rtx
15790 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15791 {
15792 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15793 return gen_const_vec_duplicate (mode, c);
15794 }
15795
15796 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15797
15798 bool
15799 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15800 {
15801 machine_mode vmode;
15802
15803 vmode = aarch64_simd_container_mode (mode, 64);
15804 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15805 return aarch64_simd_valid_immediate (op_v, NULL);
15806 }
15807
15808 /* Construct and return a PARALLEL RTX vector with elements numbering the
15809 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15810 the vector - from the perspective of the architecture. This does not
15811 line up with GCC's perspective on lane numbers, so we end up with
15812 different masks depending on our target endian-ness. The diagram
15813 below may help. We must draw the distinction when building masks
15814 which select one half of the vector. An instruction selecting
15815 architectural low-lanes for a big-endian target must be described using
15816 a mask selecting GCC high-lanes.
15817
15818 Big-Endian Little-Endian
15819
15820 GCC 0 1 2 3 3 2 1 0
15821 | x | x | x | x | | x | x | x | x |
15822 Architecture 3 2 1 0 3 2 1 0
15823
15824 Low Mask: { 2, 3 } { 0, 1 }
15825 High Mask: { 0, 1 } { 2, 3 }
15826
15827 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15828
15829 rtx
15830 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15831 {
15832 rtvec v = rtvec_alloc (nunits / 2);
15833 int high_base = nunits / 2;
15834 int low_base = 0;
15835 int base;
15836 rtx t1;
15837 int i;
15838
15839 if (BYTES_BIG_ENDIAN)
15840 base = high ? low_base : high_base;
15841 else
15842 base = high ? high_base : low_base;
15843
15844 for (i = 0; i < nunits / 2; i++)
15845 RTVEC_ELT (v, i) = GEN_INT (base + i);
15846
15847 t1 = gen_rtx_PARALLEL (mode, v);
15848 return t1;
15849 }
15850
15851 /* Check OP for validity as a PARALLEL RTX vector with elements
15852 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15853 from the perspective of the architecture. See the diagram above
15854 aarch64_simd_vect_par_cnst_half for more details. */
15855
15856 bool
15857 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15858 bool high)
15859 {
15860 int nelts;
15861 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15862 return false;
15863
15864 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15865 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15866 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15867 int i = 0;
15868
15869 if (count_op != count_ideal)
15870 return false;
15871
15872 for (i = 0; i < count_ideal; i++)
15873 {
15874 rtx elt_op = XVECEXP (op, 0, i);
15875 rtx elt_ideal = XVECEXP (ideal, 0, i);
15876
15877 if (!CONST_INT_P (elt_op)
15878 || INTVAL (elt_ideal) != INTVAL (elt_op))
15879 return false;
15880 }
15881 return true;
15882 }
15883
15884 /* Return a PARALLEL containing NELTS elements, with element I equal
15885 to BASE + I * STEP. */
15886
15887 rtx
15888 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15889 {
15890 rtvec vec = rtvec_alloc (nelts);
15891 for (unsigned int i = 0; i < nelts; ++i)
15892 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15893 return gen_rtx_PARALLEL (VOIDmode, vec);
15894 }
15895
15896 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15897 series with step STEP. */
15898
15899 bool
15900 aarch64_stepped_int_parallel_p (rtx op, int step)
15901 {
15902 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15903 return false;
15904
15905 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15906 for (int i = 1; i < XVECLEN (op, 0); ++i)
15907 if (!CONST_INT_P (XVECEXP (op, 0, i))
15908 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15909 return false;
15910
15911 return true;
15912 }
15913
15914 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15915 HIGH (exclusive). */
15916 void
15917 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15918 const_tree exp)
15919 {
15920 HOST_WIDE_INT lane;
15921 gcc_assert (CONST_INT_P (operand));
15922 lane = INTVAL (operand);
15923
15924 if (lane < low || lane >= high)
15925 {
15926 if (exp)
15927 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15928 else
15929 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15930 }
15931 }
15932
15933 /* Perform endian correction on lane number N, which indexes a vector
15934 of mode MODE, and return the result as an SImode rtx. */
15935
15936 rtx
15937 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15938 {
15939 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15940 }
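
/* Illustration, assuming ENDIAN_LANE_N maps GCC lane N to NUNITS - 1 - N on
   big-endian targets and is the identity otherwise: lane 0 of a V4SImode
   vector stays 0 on little-endian but becomes 3 on big-endian.  */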
15941
15942 /* Return TRUE if OP is a valid vector addressing mode. */
15943
15944 bool
15945 aarch64_simd_mem_operand_p (rtx op)
15946 {
15947 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15948 || REG_P (XEXP (op, 0)));
15949 }
15950
15951 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15952
15953 bool
15954 aarch64_sve_ld1r_operand_p (rtx op)
15955 {
15956 struct aarch64_address_info addr;
15957 scalar_mode mode;
15958
15959 return (MEM_P (op)
15960 && is_a <scalar_mode> (GET_MODE (op), &mode)
15961 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15962 && addr.type == ADDRESS_REG_IMM
15963 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15964 }
15965
15966 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15967 bool
15968 aarch64_sve_ld1rq_operand_p (rtx op)
15969 {
15970 struct aarch64_address_info addr;
15971 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15972 if (!MEM_P (op)
15973 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15974 return false;
15975
15976 if (addr.type == ADDRESS_REG_IMM)
15977 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15978
15979 if (addr.type == ADDRESS_REG_REG)
15980 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15981
15982 return false;
15983 }
15984
15985 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15986 The conditions for STR are the same. */
15987 bool
15988 aarch64_sve_ldr_operand_p (rtx op)
15989 {
15990 struct aarch64_address_info addr;
15991
15992 return (MEM_P (op)
15993 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15994 false, ADDR_QUERY_ANY)
15995 && addr.type == ADDRESS_REG_IMM);
15996 }
15997
15998 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15999 We need to be able to access the individual pieces, so the range
16000 is different from LD[234] and ST[234]. */
16001 bool
16002 aarch64_sve_struct_memory_operand_p (rtx op)
16003 {
16004 if (!MEM_P (op))
16005 return false;
16006
16007 machine_mode mode = GET_MODE (op);
16008 struct aarch64_address_info addr;
16009 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
16010 ADDR_QUERY_ANY)
16011 || addr.type != ADDRESS_REG_IMM)
16012 return false;
16013
16014 poly_int64 first = addr.const_offset;
16015 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
16016 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
16017 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
16018 }
16019
16020 /* Emit a register copy from operand to operand, taking care not to
16021 early-clobber source registers in the process.
16022
16023 COUNT is the number of components into which the copy needs to be
16024 decomposed. */
16025 void
16026 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
16027 unsigned int count)
16028 {
16029 unsigned int i;
16030 int rdest = REGNO (operands[0]);
16031 int rsrc = REGNO (operands[1]);
16032
16033 if (!reg_overlap_mentioned_p (operands[0], operands[1])
16034 || rdest < rsrc)
16035 for (i = 0; i < count; i++)
16036 emit_move_insn (gen_rtx_REG (mode, rdest + i),
16037 gen_rtx_REG (mode, rsrc + i));
16038 else
16039 for (i = 0; i < count; i++)
16040 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
16041 gen_rtx_REG (mode, rsrc + count - i - 1));
16042 }
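
/* For example, moving a two-register value from { R, R+1 } to { R+1, R+2 }
   overlaps and the destination number is higher than the source, so the
   second loop above copies the highest-numbered register first
   (R+1 -> R+2, then R -> R+1) and never clobbers a source register before
   it has been read.  */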
16043
16044 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16045 one of VSTRUCT modes: OI, CI, or XI. */
16046 int
16047 aarch64_simd_attr_length_rglist (machine_mode mode)
16048 {
16049 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16050 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16051 }
16052
16053 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16054 alignment of a vector to 128 bits. SVE predicates have an alignment of
16055 16 bits. */
16056 static HOST_WIDE_INT
16057 aarch64_simd_vector_alignment (const_tree type)
16058 {
16059 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16060 be set for non-predicate vectors of booleans. Modes are the most
16061 direct way we have of identifying real SVE predicate types. */
16062 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
16063 return 16;
16064 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16065 return 128;
16066 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
16067 }
16068
16069 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16070 static poly_uint64
16071 aarch64_vectorize_preferred_vector_alignment (const_tree type)
16072 {
16073 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
16074 {
16075 /* If the length of the vector is fixed, try to align to that length,
16076 otherwise don't try to align at all. */
16077 HOST_WIDE_INT result;
16078 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
16079 result = TYPE_ALIGN (TREE_TYPE (type));
16080 return result;
16081 }
16082 return TYPE_ALIGN (type);
16083 }
16084
16085 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16086 static bool
16087 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
16088 {
16089 if (is_packed)
16090 return false;
16091
16092 /* For fixed-length vectors, check that the vectorizer will aim for
16093 full-vector alignment. This isn't true for generic GCC vectors
16094 that are wider than the ABI maximum of 128 bits. */
16095 poly_uint64 preferred_alignment =
16096 aarch64_vectorize_preferred_vector_alignment (type);
16097 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16098 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16099 preferred_alignment))
16100 return false;
16101
16102 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16103 return true;
16104 }
16105
16106 /* Return true if the vector misalignment factor is supported by the
16107 target. */
16108 static bool
16109 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16110 const_tree type, int misalignment,
16111 bool is_packed)
16112 {
16113 if (TARGET_SIMD && STRICT_ALIGNMENT)
16114 {
16115 /* Return false if the movmisalign pattern is not supported for this mode. */
16116 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16117 return false;
16118
16119 /* Misalignment factor is unknown at compile time. */
16120 if (misalignment == -1)
16121 return false;
16122 }
16123 return default_builtin_support_vector_misalignment (mode, type, misalignment,
16124 is_packed);
16125 }
16126
16127 /* If VALS is a vector constant that can be loaded into a register
16128 using DUP, generate instructions to do so and return an RTX to
16129 assign to the register. Otherwise return NULL_RTX. */
16130 static rtx
16131 aarch64_simd_dup_constant (rtx vals)
16132 {
16133 machine_mode mode = GET_MODE (vals);
16134 machine_mode inner_mode = GET_MODE_INNER (mode);
16135 rtx x;
16136
16137 if (!const_vec_duplicate_p (vals, &x))
16138 return NULL_RTX;
16139
16140 /* We can load this constant by using DUP and a constant in a
16141 single ARM register. This will be cheaper than a vector
16142 load. */
16143 x = copy_to_mode_reg (inner_mode, x);
16144 return gen_vec_duplicate (mode, x);
16145 }
16146
16147
16148 /* Generate code to load VALS, which is a PARALLEL containing only
16149 constants (for vec_init) or CONST_VECTOR, efficiently into a
16150 register. Returns an RTX to copy into the register, or NULL_RTX
16151 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16152 static rtx
16153 aarch64_simd_make_constant (rtx vals)
16154 {
16155 machine_mode mode = GET_MODE (vals);
16156 rtx const_dup;
16157 rtx const_vec = NULL_RTX;
16158 int n_const = 0;
16159 int i;
16160
16161 if (GET_CODE (vals) == CONST_VECTOR)
16162 const_vec = vals;
16163 else if (GET_CODE (vals) == PARALLEL)
16164 {
16165 /* A CONST_VECTOR must contain only CONST_INTs and
16166 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16167 Only store valid constants in a CONST_VECTOR. */
16168 int n_elts = XVECLEN (vals, 0);
16169 for (i = 0; i < n_elts; ++i)
16170 {
16171 rtx x = XVECEXP (vals, 0, i);
16172 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16173 n_const++;
16174 }
16175 if (n_const == n_elts)
16176 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16177 }
16178 else
16179 gcc_unreachable ();
16180
16181 if (const_vec != NULL_RTX
16182 && aarch64_simd_valid_immediate (const_vec, NULL))
16183 /* Load using MOVI/MVNI. */
16184 return const_vec;
16185 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16186 /* Loaded using DUP. */
16187 return const_dup;
16188 else if (const_vec != NULL_RTX)
16189 /* Load from constant pool. We cannot take advantage of single-cycle
16190 LD1 because we need a PC-relative addressing mode. */
16191 return const_vec;
16192 else
16193 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16194 We cannot construct an initializer. */
16195 return NULL_RTX;
16196 }
16197
16198 /* Expand a vector initialisation sequence, such that TARGET is
16199 initialised to contain VALS. */
16200
16201 void
16202 aarch64_expand_vector_init (rtx target, rtx vals)
16203 {
16204 machine_mode mode = GET_MODE (target);
16205 scalar_mode inner_mode = GET_MODE_INNER (mode);
16206 /* The number of vector elements. */
16207 int n_elts = XVECLEN (vals, 0);
16208 /* The number of vector elements which are not constant. */
16209 int n_var = 0;
16210 rtx any_const = NULL_RTX;
16211 /* The first element of vals. */
16212 rtx v0 = XVECEXP (vals, 0, 0);
16213 bool all_same = true;
16214
16215 /* This is a special vec_init<M><N> where N is not an element mode but a
16216 vector mode with half the elements of M. We expect to find two entries
16217 of mode N in VALS and we must put their concatenation into TARGET. */
16218 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16219 {
16220 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16221 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16222 rtx lo = XVECEXP (vals, 0, 0);
16223 rtx hi = XVECEXP (vals, 0, 1);
16224 machine_mode narrow_mode = GET_MODE (lo);
16225 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16226 gcc_assert (narrow_mode == GET_MODE (hi));
16227
16228 /* When we want to concatenate a half-width vector with zeroes we can
16229 use the aarch64_combinez[_be] patterns. Just make sure that the
16230 zeroes are in the right half. */
16231 if (BYTES_BIG_ENDIAN
16232 && aarch64_simd_imm_zero (lo, narrow_mode)
16233 && general_operand (hi, narrow_mode))
16234 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16235 else if (!BYTES_BIG_ENDIAN
16236 && aarch64_simd_imm_zero (hi, narrow_mode)
16237 && general_operand (lo, narrow_mode))
16238 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16239 else
16240 {
16241 /* Else create the two half-width registers and combine them. */
16242 if (!REG_P (lo))
16243 lo = force_reg (GET_MODE (lo), lo);
16244 if (!REG_P (hi))
16245 hi = force_reg (GET_MODE (hi), hi);
16246
16247 if (BYTES_BIG_ENDIAN)
16248 std::swap (lo, hi);
16249 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16250 }
16251 return;
16252 }
16253
16254 /* Count the number of variable elements to initialise. */
16255 for (int i = 0; i < n_elts; ++i)
16256 {
16257 rtx x = XVECEXP (vals, 0, i);
16258 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16259 ++n_var;
16260 else
16261 any_const = x;
16262
16263 all_same &= rtx_equal_p (x, v0);
16264 }
16265
16266 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16267 how best to handle this. */
16268 if (n_var == 0)
16269 {
16270 rtx constant = aarch64_simd_make_constant (vals);
16271 if (constant != NULL_RTX)
16272 {
16273 emit_move_insn (target, constant);
16274 return;
16275 }
16276 }
16277
16278 /* Splat a single non-constant element if we can. */
16279 if (all_same)
16280 {
16281 rtx x = copy_to_mode_reg (inner_mode, v0);
16282 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16283 return;
16284 }
16285
16286 enum insn_code icode = optab_handler (vec_set_optab, mode);
16287 gcc_assert (icode != CODE_FOR_nothing);
16288
16289 /* If there are only variable elements, try to optimize
16290 the insertion using dup for the most common element
16291 followed by insertions. */
16292
16293 /* The algorithm will fill matches[*][0] with the earliest matching element,
16294 and matches[X][1] with the count of duplicate elements (if X is the
16295 earliest element which has duplicates). */
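/* For example, for VALS = { a, b, a, a } the loop below produces
   matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 } and
   matches[3] = { 0, 0 }, so element 0 (a) is chosen as the dup value and
   only element 1 (b) needs a separate insert.  */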
16296
16297 if (n_var == n_elts && n_elts <= 16)
16298 {
16299 int matches[16][2] = {0};
16300 for (int i = 0; i < n_elts; i++)
16301 {
16302 for (int j = 0; j <= i; j++)
16303 {
16304 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16305 {
16306 matches[i][0] = j;
16307 matches[j][1]++;
16308 break;
16309 }
16310 }
16311 }
16312 int maxelement = 0;
16313 int maxv = 0;
16314 for (int i = 0; i < n_elts; i++)
16315 if (matches[i][1] > maxv)
16316 {
16317 maxelement = i;
16318 maxv = matches[i][1];
16319 }
16320
16321 /* Create a duplicate of the most common element, unless all elements
16322 are equally useless to us, in which case just immediately set the
16323 vector register using the first element. */
16324
16325 if (maxv == 1)
16326 {
16327 /* For vectors of two 64-bit elements, we can do even better. */
16328 if (n_elts == 2
16329 && (inner_mode == E_DImode
16330 || inner_mode == E_DFmode))
16331
16332 {
16333 rtx x0 = XVECEXP (vals, 0, 0);
16334 rtx x1 = XVECEXP (vals, 0, 1);
16335 /* Combine can pick up this case, but handling it directly
16336 here leaves clearer RTL.
16337
16338 This is load_pair_lanes<mode>, and also gives us a clean-up
16339 for store_pair_lanes<mode>. */
16340 if (memory_operand (x0, inner_mode)
16341 && memory_operand (x1, inner_mode)
16342 && !STRICT_ALIGNMENT
16343 && rtx_equal_p (XEXP (x1, 0),
16344 plus_constant (Pmode,
16345 XEXP (x0, 0),
16346 GET_MODE_SIZE (inner_mode))))
16347 {
16348 rtx t;
16349 if (inner_mode == DFmode)
16350 t = gen_load_pair_lanesdf (target, x0, x1);
16351 else
16352 t = gen_load_pair_lanesdi (target, x0, x1);
16353 emit_insn (t);
16354 return;
16355 }
16356 }
16357 /* The subreg-move sequence below will move into lane zero of the
16358 vector register. For big-endian we want that position to hold
16359 the last element of VALS. */
16360 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16361 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16362 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16363 }
16364 else
16365 {
16366 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16367 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16368 }
16369
16370 /* Insert the rest. */
16371 for (int i = 0; i < n_elts; i++)
16372 {
16373 rtx x = XVECEXP (vals, 0, i);
16374 if (matches[i][0] == maxelement)
16375 continue;
16376 x = copy_to_mode_reg (inner_mode, x);
16377 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16378 }
16379 return;
16380 }
16381
16382 /* Initialise a vector which is part-variable. We want to first try
16383 to build those lanes which are constant in the most efficient way we
16384 can. */
16385 if (n_var != n_elts)
16386 {
16387 rtx copy = copy_rtx (vals);
16388
16389 /* Load constant part of vector. We really don't care what goes into the
16390 parts we will overwrite, but we're more likely to be able to load the
16391 constant efficiently if it has fewer, larger, repeating parts
16392 (see aarch64_simd_valid_immediate). */
16393 for (int i = 0; i < n_elts; i++)
16394 {
16395 rtx x = XVECEXP (vals, 0, i);
16396 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16397 continue;
16398 rtx subst = any_const;
16399 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16400 {
16401 /* Look in the copied vector, as more elements are const. */
16402 rtx test = XVECEXP (copy, 0, i ^ bit);
16403 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16404 {
16405 subst = test;
16406 break;
16407 }
16408 }
16409 XVECEXP (copy, 0, i) = subst;
16410 }
16411 aarch64_expand_vector_init (target, copy);
16412 }
16413
16414 /* Insert the variable lanes directly. */
16415 for (int i = 0; i < n_elts; i++)
16416 {
16417 rtx x = XVECEXP (vals, 0, i);
16418 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16419 continue;
16420 x = copy_to_mode_reg (inner_mode, x);
16421 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16422 }
16423 }
16424
16425 /* Emit RTL corresponding to:
16426 insr TARGET, ELEM. */
16427
16428 static void
16429 emit_insr (rtx target, rtx elem)
16430 {
16431 machine_mode mode = GET_MODE (target);
16432 scalar_mode elem_mode = GET_MODE_INNER (mode);
16433 elem = force_reg (elem_mode, elem);
16434
16435 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16436 gcc_assert (icode != CODE_FOR_nothing);
16437 emit_insn (GEN_FCN (icode) (target, target, elem));
16438 }
16439
16440 /* Subroutine of aarch64_sve_expand_vector_init for handling
16441 trailing constants.
16442 This function works as follows:
16443 (a) Create a new vector consisting of trailing constants.
16444 (b) Initialize TARGET with the constant vector using emit_move_insn.
16445 (c) Insert remaining elements in TARGET using insr.
16446 NELTS is the total number of elements in the original vector, while
16447 NELTS_REQD is the number of elements that are actually
16448 significant.
16449
16450 ??? The heuristic used is to do the above only if the number of trailing
16451 constants is at least half the total number of elements. May need fine-tuning. */
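/* For example, with BUILDER = { a, b, 1, 2 } and NELTS_REQD == 4 there are
   two trailing constants, which meets the >= NELTS_REQD / 2 threshold:
   steps (a)/(b) move a constant vector whose leading elements are
   { 1, 2, ... } into TARGET, and step (c) then emits "insr TARGET, b"
   followed by "insr TARGET, a", leaving TARGET = { a, b, 1, 2, ... }.  */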
16452
16453 static bool
16454 aarch64_sve_expand_vector_init_handle_trailing_constants
16455 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16456 {
16457 machine_mode mode = GET_MODE (target);
16458 scalar_mode elem_mode = GET_MODE_INNER (mode);
16459 int n_trailing_constants = 0;
16460
16461 for (int i = nelts_reqd - 1;
16462 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16463 i--)
16464 n_trailing_constants++;
16465
16466 if (n_trailing_constants >= nelts_reqd / 2)
16467 {
16468 rtx_vector_builder v (mode, 1, nelts);
16469 for (int i = 0; i < nelts; i++)
16470 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16471 rtx const_vec = v.build ();
16472 emit_move_insn (target, const_vec);
16473
16474 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16475 emit_insr (target, builder.elt (i));
16476
16477 return true;
16478 }
16479
16480 return false;
16481 }
16482
16483 /* Subroutine of aarch64_sve_expand_vector_init.
16484 Works as follows:
16485 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16486 (b) Skip trailing elements from BUILDER, which are the same as
16487 element NELTS_REQD - 1.
16488 (c) Insert earlier elements in reverse order in TARGET using insr. */
16489
16490 static void
16491 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16492 const rtx_vector_builder &builder,
16493 int nelts_reqd)
16494 {
16495 machine_mode mode = GET_MODE (target);
16496 scalar_mode elem_mode = GET_MODE_INNER (mode);
16497
16498 struct expand_operand ops[2];
16499 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16500 gcc_assert (icode != CODE_FOR_nothing);
16501
16502 create_output_operand (&ops[0], target, mode);
16503 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16504 expand_insn (icode, 2, ops);
16505
16506 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16507 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16508 emit_insr (target, builder.elt (i));
16509 }
16510
16511 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16512 when all trailing elements of builder are same.
16513 This works as follows:
16514 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16515 (b) Insert remaining elements in TARGET using insr.
16516
16517 ??? The heuristic used is to do above if number of same trailing elements
16518 is at least 3/4 of total number of elements, loosely based on
16519 heuristic from mostly_zeros_p. May need fine-tuning. */
16520
16521 static bool
16522 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16523 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16524 {
16525 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16526 if (ndups >= (3 * nelts_reqd) / 4)
16527 {
16528 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16529 nelts_reqd - ndups + 1);
16530 return true;
16531 }
16532
16533 return false;
16534 }
16535
16536 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16537 of elements in BUILDER.
16538
16539 The function tries to initialize TARGET from BUILDER if it fits one
16540 of the special cases outlined below.
16541
16542 Failing that, the function divides BUILDER into two sub-vectors:
16543 v_even = even elements of BUILDER;
16544 v_odd = odd elements of BUILDER;
16545
16546 and recursively calls itself with v_even and v_odd.
16547
16548 if (recursive call succeeded for v_even or v_odd)
16549 TARGET = zip (v_even, v_odd)
16550
16551 The function returns true if it managed to build TARGET from BUILDER
16552 with one of the special cases, false otherwise.
16553
16554 Example: {a, 1, b, 2, c, 3, d, 4}
16555
16556 The vector gets divided into:
16557 v_even = {a, b, c, d}
16558 v_odd = {1, 2, 3, 4}
16559
16560 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16561 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16562
16563 aarch64_sve_expand_vector_init (v_even) fails since v_even contains
16564 4 distinct variable elements, so we construct tmp1 from v_even using insr:
16565 tmp1 = dup(d)
16566 insr tmp1, c
16567 insr tmp1, b
16568 insr tmp1, a
16569
16570 And finally:
16571 TARGET = zip (tmp1, tmp2)
16572 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16573
16574 static bool
16575 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16576 int nelts, int nelts_reqd)
16577 {
16578 machine_mode mode = GET_MODE (target);
16579
16580 /* Case 1: Vector contains trailing constants. */
16581
16582 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16583 (target, builder, nelts, nelts_reqd))
16584 return true;
16585
16586 /* Case 2: Vector contains leading constants. */
16587
16588 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16589 for (int i = 0; i < nelts_reqd; i++)
16590 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16591 rev_builder.finalize ();
16592
16593 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16594 (target, rev_builder, nelts, nelts_reqd))
16595 {
16596 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16597 return true;
16598 }
16599
16600 /* Case 3: Vector contains trailing same element. */
16601
16602 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16603 (target, builder, nelts_reqd))
16604 return true;
16605
16606 /* Case 4: Vector contains leading same element. */
16607
16608 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16609 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16610 {
16611 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16612 return true;
16613 }
16614
16615 /* Avoid recursing below 4-elements.
16616 ??? The threshold 4 may need fine-tuning. */
16617
16618 if (nelts_reqd <= 4)
16619 return false;
16620
16621 rtx_vector_builder v_even (mode, 1, nelts);
16622 rtx_vector_builder v_odd (mode, 1, nelts);
16623
16624 for (int i = 0; i < nelts * 2; i += 2)
16625 {
16626 v_even.quick_push (builder.elt (i));
16627 v_odd.quick_push (builder.elt (i + 1));
16628 }
16629
16630 v_even.finalize ();
16631 v_odd.finalize ();
16632
16633 rtx tmp1 = gen_reg_rtx (mode);
16634 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16635 nelts, nelts_reqd / 2);
16636
16637 rtx tmp2 = gen_reg_rtx (mode);
16638 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16639 nelts, nelts_reqd / 2);
16640
16641 if (!did_even_p && !did_odd_p)
16642 return false;
16643
16644 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16645 special cases and zip v_even, v_odd. */
16646
16647 if (!did_even_p)
16648 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16649
16650 if (!did_odd_p)
16651 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16652
16653 rtvec v = gen_rtvec (2, tmp1, tmp2);
16654 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16655 return true;
16656 }
16657
16658 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16659
16660 void
16661 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16662 {
16663 machine_mode mode = GET_MODE (target);
16664 int nelts = XVECLEN (vals, 0);
16665
16666 rtx_vector_builder v (mode, 1, nelts);
16667 for (int i = 0; i < nelts; i++)
16668 v.quick_push (XVECEXP (vals, 0, i));
16669 v.finalize ();
16670
16671 /* If neither sub-vectors of v could be initialized specially,
16672 then use INSR to insert all elements from v into TARGET.
16673 ??? This might not be optimal for vectors with large
16674 initializers like 16-element or above.
16675 For nelts < 4, it probably isn't useful to handle specially. */
16676
16677 if (nelts < 4
16678 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16679 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16680 }
16681
16682 /* Check whether VALUE is a vector constant in which every element
16683 is either a power of 2 or a negated power of 2. If so, return
16684 a constant vector of log2s, and flip CODE between PLUS and MINUS
16685 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
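/* For example, a multiplier of { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with
   CODE left unchanged, while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } and
   CODE is flipped (PLUS <-> MINUS), so x * -8 + a can be rewritten as
   a - (x << 3).  */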
16686
16687 static rtx
16688 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16689 {
16690 if (GET_CODE (value) != CONST_VECTOR)
16691 return NULL_RTX;
16692
16693 rtx_vector_builder builder;
16694 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16695 return NULL_RTX;
16696
16697 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16698 /* 1 if the result of the multiplication must be negated,
16699 0 if it mustn't, or -1 if we don't yet care. */
16700 int negate = -1;
16701 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16702 for (unsigned int i = 0; i < encoded_nelts; ++i)
16703 {
16704 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16705 if (!CONST_SCALAR_INT_P (elt))
16706 return NULL_RTX;
16707 rtx_mode_t val (elt, int_mode);
16708 wide_int pow2 = wi::neg (val);
16709 if (val != pow2)
16710 {
16711 /* It matters whether we negate or not. Make that choice,
16712 and make sure that it's consistent with previous elements. */
16713 if (negate == !wi::neg_p (val))
16714 return NULL_RTX;
16715 negate = wi::neg_p (val);
16716 if (!negate)
16717 pow2 = val;
16718 }
16719 /* POW2 is now the value that we want to be a power of 2. */
16720 int shift = wi::exact_log2 (pow2);
16721 if (shift < 0)
16722 return NULL_RTX;
16723 builder.quick_push (gen_int_mode (shift, int_mode));
16724 }
16725 if (negate == -1)
16726 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16727 code = PLUS;
16728 else if (negate == 1)
16729 code = code == PLUS ? MINUS : PLUS;
16730 return builder.build ();
16731 }
16732
16733 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16734 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16735 operands array, in the same order as for fma_optab. Return true if
16736 the function emitted all the necessary instructions, false if the caller
16737 should generate the pattern normally with the new OPERANDS array. */
16738
16739 bool
16740 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16741 {
16742 machine_mode mode = GET_MODE (operands[0]);
16743 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16744 {
16745 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16746 NULL_RTX, true, OPTAB_DIRECT);
16747 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16748 operands[3], product, operands[0], true,
16749 OPTAB_DIRECT);
16750 return true;
16751 }
16752 operands[2] = force_reg (mode, operands[2]);
16753 return false;
16754 }
16755
16756 /* Likewise, but for a conditional pattern. */
16757
16758 bool
16759 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16760 {
16761 machine_mode mode = GET_MODE (operands[0]);
16762 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16763 {
16764 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16765 NULL_RTX, true, OPTAB_DIRECT);
16766 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16767 operands[4], product, operands[5]));
16768 return true;
16769 }
16770 operands[3] = force_reg (mode, operands[3]);
16771 return false;
16772 }
16773
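/* Implement TARGET_SHIFT_TRUNCATION_MASK (the hook name is inferred from
   this function's role).  Return 0 when SHIFT_COUNT_TRUNCATED is not in
   effect or MODE is a vector data mode; otherwise return
   GET_MODE_UNIT_BITSIZE (mode) - 1, e.g. 63 for DImode, so shift counts
   are known to be truncated to the element width.  */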
16774 static unsigned HOST_WIDE_INT
16775 aarch64_shift_truncation_mask (machine_mode mode)
16776 {
16777 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16778 return 0;
16779 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16780 }
16781
16782 /* Select a format to encode pointers in exception handling data. */
16783 int
16784 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16785 {
16786 int type;
16787 switch (aarch64_cmodel)
16788 {
16789 case AARCH64_CMODEL_TINY:
16790 case AARCH64_CMODEL_TINY_PIC:
16791 case AARCH64_CMODEL_SMALL:
16792 case AARCH64_CMODEL_SMALL_PIC:
16793 case AARCH64_CMODEL_SMALL_SPIC:
16794 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16795 for everything. */
16796 type = DW_EH_PE_sdata4;
16797 break;
16798 default:
16799 /* No assumptions here. 8-byte relocs required. */
16800 type = DW_EH_PE_sdata8;
16801 break;
16802 }
16803 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16804 }
16805
16806 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16807
16808 static void
16809 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16810 {
16811 if (aarch64_simd_decl_p (decl))
16812 {
16813 fprintf (stream, "\t.variant_pcs\t");
16814 assemble_name (stream, name);
16815 fprintf (stream, "\n");
16816 }
16817 }
16818
16819 /* The last .arch and .tune assembly strings that we printed. */
16820 static std::string aarch64_last_printed_arch_string;
16821 static std::string aarch64_last_printed_tune_string;
16822
16823 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16824 by the function fndecl. */
16825
16826 void
16827 aarch64_declare_function_name (FILE *stream, const char* name,
16828 tree fndecl)
16829 {
16830 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16831
16832 struct cl_target_option *targ_options;
16833 if (target_parts)
16834 targ_options = TREE_TARGET_OPTION (target_parts);
16835 else
16836 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16837 gcc_assert (targ_options);
16838
16839 const struct processor *this_arch
16840 = aarch64_get_arch (targ_options->x_explicit_arch);
16841
16842 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16843 std::string extension
16844 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16845 this_arch->flags);
16846 /* Only update the assembler .arch string if it is distinct from the last
16847 such string we printed. */
16848 std::string to_print = this_arch->name + extension;
16849 if (to_print != aarch64_last_printed_arch_string)
16850 {
16851 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16852 aarch64_last_printed_arch_string = to_print;
16853 }
16854
16855 /* Print the cpu name we're tuning for in the comments; it might be
16856 useful to readers of the generated asm. Do this only when it changes
16857 from function to function and verbose assembly is requested. */
16858 const struct processor *this_tune
16859 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16860
16861 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16862 {
16863 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16864 this_tune->name);
16865 aarch64_last_printed_tune_string = this_tune->name;
16866 }
16867
16868 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16869
16870 /* Don't forget the type directive for ELF. */
16871 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16872 ASM_OUTPUT_LABEL (stream, name);
16873 }
16874
16875 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16876
16877 void
16878 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16879 {
16880 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16881 const char *value = IDENTIFIER_POINTER (target);
16882 aarch64_asm_output_variant_pcs (stream, decl, name);
16883 ASM_OUTPUT_DEF (stream, name, value);
16884 }
16885
16886 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16887 function symbol references. */
16888
16889 void
16890 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16891 {
16892 default_elf_asm_output_external (stream, decl, name);
16893 aarch64_asm_output_variant_pcs (stream, decl, name);
16894 }
16895
16896 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16897 Used to output the .cfi_b_key_frame directive when signing the current
16898 function with the B key. */
16899
16900 void
16901 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16902 {
16903 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16904 && aarch64_ra_sign_key == AARCH64_KEY_B)
16905 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16906 }
16907
16908 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16909
16910 static void
16911 aarch64_start_file (void)
16912 {
16913 struct cl_target_option *default_options
16914 = TREE_TARGET_OPTION (target_option_default_node);
16915
16916 const struct processor *default_arch
16917 = aarch64_get_arch (default_options->x_explicit_arch);
16918 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16919 std::string extension
16920 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16921 default_arch->flags);
16922
16923 aarch64_last_printed_arch_string = default_arch->name + extension;
16924 aarch64_last_printed_tune_string = "";
16925 asm_fprintf (asm_out_file, "\t.arch %s\n",
16926 aarch64_last_printed_arch_string.c_str ());
16927
16928 default_file_start ();
16929 }
16930
16931 /* Emit load exclusive. */
16932
16933 static void
16934 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16935 rtx mem, rtx model_rtx)
16936 {
16937 if (mode == TImode)
16938 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
16939 gen_highpart (DImode, rval),
16940 mem, model_rtx));
16941 else
16942 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16943 }
16944
16945 /* Emit store exclusive. */
16946
16947 static void
16948 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16949 rtx mem, rtx rval, rtx model_rtx)
16950 {
16951 if (mode == TImode)
16952 emit_insn (gen_aarch64_store_exclusive_pair
16953 (bval, mem, operand_subword (rval, 0, 0, TImode),
16954 operand_subword (rval, 1, 0, TImode), model_rtx));
16955 else
16956 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16957 }
16958
16959 /* Mark the previous jump instruction as unlikely. */
16960
16961 static void
16962 aarch64_emit_unlikely_jump (rtx insn)
16963 {
16964 rtx_insn *jump = emit_jump_insn (insn);
16965 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16966 }
16967
16968 /* We store the names of the various atomic helpers in a 5x4 array.
16969 Return the libcall function given MODE, MODEL and NAMES. */
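/* For example, an SImode swap with MEMMODEL_ACQUIRE and
   aarch64_ool_swp_names selects mode_idx 2 and model_idx 1, giving the
   libgcc helper "__aarch64_swp4_acq".  */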
16970
16971 rtx
16972 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
16973 const atomic_ool_names *names)
16974 {
16975 memmodel model = memmodel_base (INTVAL (model_rtx));
16976 int mode_idx, model_idx;
16977
16978 switch (mode)
16979 {
16980 case E_QImode:
16981 mode_idx = 0;
16982 break;
16983 case E_HImode:
16984 mode_idx = 1;
16985 break;
16986 case E_SImode:
16987 mode_idx = 2;
16988 break;
16989 case E_DImode:
16990 mode_idx = 3;
16991 break;
16992 case E_TImode:
16993 mode_idx = 4;
16994 break;
16995 default:
16996 gcc_unreachable ();
16997 }
16998
16999 switch (model)
17000 {
17001 case MEMMODEL_RELAXED:
17002 model_idx = 0;
17003 break;
17004 case MEMMODEL_CONSUME:
17005 case MEMMODEL_ACQUIRE:
17006 model_idx = 1;
17007 break;
17008 case MEMMODEL_RELEASE:
17009 model_idx = 2;
17010 break;
17011 case MEMMODEL_ACQ_REL:
17012 case MEMMODEL_SEQ_CST:
17013 model_idx = 3;
17014 break;
17015 default:
17016 gcc_unreachable ();
17017 }
17018
17019 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
17020 VISIBILITY_HIDDEN);
17021 }
17022
17023 #define DEF0(B, N) \
17024 { "__aarch64_" #B #N "_relax", \
17025 "__aarch64_" #B #N "_acq", \
17026 "__aarch64_" #B #N "_rel", \
17027 "__aarch64_" #B #N "_acq_rel" }
17028
17029 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17030 { NULL, NULL, NULL, NULL }
17031 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17032
17033 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
17034 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
17035 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
17036 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
17037 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
17038 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
17039
17040 #undef DEF0
17041 #undef DEF4
17042 #undef DEF5
17043
17044 /* Expand a compare and swap pattern. */
17045
17046 void
17047 aarch64_expand_compare_and_swap (rtx operands[])
17048 {
17049 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
17050 machine_mode mode, r_mode;
17051
17052 bval = operands[0];
17053 rval = operands[1];
17054 mem = operands[2];
17055 oldval = operands[3];
17056 newval = operands[4];
17057 is_weak = operands[5];
17058 mod_s = operands[6];
17059 mod_f = operands[7];
17060 mode = GET_MODE (mem);
17061
17062 /* Normally the succ memory model must be stronger than fail, but in the
17063 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17064 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17065 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
17066 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
17067 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
17068
17069 r_mode = mode;
17070 if (mode == QImode || mode == HImode)
17071 {
17072 r_mode = SImode;
17073 rval = gen_reg_rtx (r_mode);
17074 }
17075
17076 if (TARGET_LSE)
17077 {
17078 /* The CAS insn requires oldval and rval overlap, but we need to
17079 have a copy of oldval saved across the operation to tell if
17080 the operation is successful. */
17081 if (reg_overlap_mentioned_p (rval, oldval))
17082 rval = copy_to_mode_reg (r_mode, oldval);
17083 else
17084 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
17085
17086 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
17087 newval, mod_s));
17088 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17089 }
17090 else if (TARGET_OUTLINE_ATOMICS)
17091 {
17092 /* Oldval must satisfy compare afterward. */
17093 if (!aarch64_plus_operand (oldval, mode))
17094 oldval = force_reg (mode, oldval);
17095 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
17096 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
17097 oldval, mode, newval, mode,
17098 XEXP (mem, 0), Pmode);
17099 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17100 }
17101 else
17102 {
17103 /* The oldval predicate varies by mode. Test it and force to reg. */
17104 insn_code code = code_for_aarch64_compare_and_swap (mode);
17105 if (!insn_data[code].operand[2].predicate (oldval, mode))
17106 oldval = force_reg (mode, oldval);
17107
17108 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17109 is_weak, mod_s, mod_f));
17110 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17111 }
17112
17113 if (r_mode != mode)
17114 rval = gen_lowpart (mode, rval);
17115 emit_move_insn (operands[1], rval);
17116
17117 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
17118 emit_insn (gen_rtx_SET (bval, x));
17119 }
17120
17121 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17122 sequence implementing an atomic operation. */
17123
17124 static void
17125 aarch64_emit_post_barrier (enum memmodel model)
17126 {
17127 const enum memmodel base_model = memmodel_base (model);
17128
17129 if (is_mm_sync (model)
17130 && (base_model == MEMMODEL_ACQUIRE
17131 || base_model == MEMMODEL_ACQ_REL
17132 || base_model == MEMMODEL_SEQ_CST))
17133 {
17134 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
17135 }
17136 }
17137
17138 /* Split a compare and swap pattern. */
17139
17140 void
17141 aarch64_split_compare_and_swap (rtx operands[])
17142 {
17143 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
17144 machine_mode mode;
17145 bool is_weak;
17146 rtx_code_label *label1, *label2;
17147 enum memmodel model;
17148
17149 rval = operands[0];
17150 mem = operands[1];
17151 oldval = operands[2];
17152 newval = operands[3];
17153 is_weak = (operands[4] != const0_rtx);
17154 model_rtx = operands[5];
17155 scratch = operands[7];
17156 mode = GET_MODE (mem);
17157 model = memmodel_from_int (INTVAL (model_rtx));
17158
17159 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17160 loop:
17161 .label1:
17162 LD[A]XR rval, [mem]
17163 CBNZ rval, .label2
17164 ST[L]XR scratch, newval, [mem]
17165 CBNZ scratch, .label1
17166 .label2:
17167 CMP rval, 0. */
17168 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
17169 oldval == const0_rtx && mode != TImode);
17170
17171 label1 = NULL;
17172 if (!is_weak)
17173 {
17174 label1 = gen_label_rtx ();
17175 emit_label (label1);
17176 }
17177 label2 = gen_label_rtx ();
17178
17179 /* The initial load can be relaxed for a __sync operation since a final
17180 barrier will be emitted to stop code hoisting. */
17181 if (is_mm_sync (model))
17182 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
17183 else
17184 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
17185
17186 if (strong_zero_p)
17187 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17188 else
17189 {
17190 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17191 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17192 }
17193 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17194 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
17195 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17196
17197 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
17198
17199 if (!is_weak)
17200 {
17201 if (aarch64_track_speculation)
17202 {
17203 /* Emit an explicit compare instruction, so that we can correctly
17204 track the condition codes. */
17205 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17206 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17207 }
17208 else
17209 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
17210
17211 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17212 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
17213 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17214 }
17215 else
17216 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17217
17218 emit_label (label2);
17219
17220 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17221 to set the condition flags. If this is not used it will be removed by
17222 later passes. */
17223 if (strong_zero_p)
17224 aarch64_gen_compare_reg (NE, rval, const0_rtx);
17225
17226 /* Emit any final barrier needed for a __sync operation. */
17227 if (is_mm_sync (model))
17228 aarch64_emit_post_barrier (model);
17229 }
17230
17231 /* Split an atomic operation. */
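/* Roughly, for a non-__sync CODE == PLUS this expands to the loop

     .label:
       ld[a]xr   old, [mem]
       add       new, old, value
       st[l]xr   w_cond, new, [mem]
       cbnz      w_cond, .label

   with the acquire/release forms chosen from MODEL_RTX, and a trailing
   barrier added for __sync operations.  */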
17232
17233 void
17234 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17235 rtx value, rtx model_rtx, rtx cond)
17236 {
17237 machine_mode mode = GET_MODE (mem);
17238 machine_mode wmode = (mode == DImode ? DImode : SImode);
17239 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17240 const bool is_sync = is_mm_sync (model);
17241 rtx_code_label *label;
17242 rtx x;
17243
17244 /* Split the atomic operation into a sequence. */
17245 label = gen_label_rtx ();
17246 emit_label (label);
17247
17248 if (new_out)
17249 new_out = gen_lowpart (wmode, new_out);
17250 if (old_out)
17251 old_out = gen_lowpart (wmode, old_out);
17252 else
17253 old_out = new_out;
17254 value = simplify_gen_subreg (wmode, value, mode, 0);
17255
17256 /* The initial load can be relaxed for a __sync operation since a final
17257 barrier will be emitted to stop code hoisting. */
17258 if (is_sync)
17259 aarch64_emit_load_exclusive (mode, old_out, mem,
17260 GEN_INT (MEMMODEL_RELAXED));
17261 else
17262 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17263
17264 switch (code)
17265 {
17266 case SET:
17267 new_out = value;
17268 break;
17269
17270 case NOT:
17271 x = gen_rtx_AND (wmode, old_out, value);
17272 emit_insn (gen_rtx_SET (new_out, x));
17273 x = gen_rtx_NOT (wmode, new_out);
17274 emit_insn (gen_rtx_SET (new_out, x));
17275 break;
17276
17277 case MINUS:
17278 if (CONST_INT_P (value))
17279 {
17280 value = GEN_INT (-INTVAL (value));
17281 code = PLUS;
17282 }
17283 /* Fall through. */
17284
17285 default:
17286 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17287 emit_insn (gen_rtx_SET (new_out, x));
17288 break;
17289 }
17290
17291 aarch64_emit_store_exclusive (mode, cond, mem,
17292 gen_lowpart (mode, new_out), model_rtx);
17293
17294 if (aarch64_track_speculation)
17295 {
17296 /* Emit an explicit compare instruction, so that we can correctly
17297 track the condition codes. */
17298 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17299 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17300 }
17301 else
17302 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17303
17304 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17305 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17306 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17307
17308 /* Emit any final barrier needed for a __sync operation. */
17309 if (is_sync)
17310 aarch64_emit_post_barrier (model);
17311 }
17312
17313 static void
17314 aarch64_init_libfuncs (void)
17315 {
17316 /* Half-precision float operations. The compiler handles all operations
17317 with NULL libfuncs by converting to SFmode. */
17318
17319 /* Conversions. */
17320 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17321 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17322
17323 /* Arithmetic. */
17324 set_optab_libfunc (add_optab, HFmode, NULL);
17325 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17326 set_optab_libfunc (smul_optab, HFmode, NULL);
17327 set_optab_libfunc (neg_optab, HFmode, NULL);
17328 set_optab_libfunc (sub_optab, HFmode, NULL);
17329
17330 /* Comparisons. */
17331 set_optab_libfunc (eq_optab, HFmode, NULL);
17332 set_optab_libfunc (ne_optab, HFmode, NULL);
17333 set_optab_libfunc (lt_optab, HFmode, NULL);
17334 set_optab_libfunc (le_optab, HFmode, NULL);
17335 set_optab_libfunc (ge_optab, HFmode, NULL);
17336 set_optab_libfunc (gt_optab, HFmode, NULL);
17337 set_optab_libfunc (unord_optab, HFmode, NULL);
17338 }
17339
17340 /* Target hook for c_mode_for_suffix. */
17341 static machine_mode
17342 aarch64_c_mode_for_suffix (char suffix)
17343 {
17344 if (suffix == 'q')
17345 return TFmode;
17346
17347 return VOIDmode;
17348 }
17349
17350 /* We can only represent floating point constants which will fit in
17351 "quarter-precision" values. These values are characterised by
17352 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
17353 by:
17354
17355 (-1)^s * (n/16) * 2^r
17356
17357 Where:
17358 's' is the sign bit.
17359 'n' is an integer in the range 16 <= n <= 31.
17360 'r' is an integer in the range -3 <= r <= 4. */
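/* For example, 1.0 is representable as (16/16) * 2^0 and 31.0 as
   (31/16) * 2^4; the representable range of positive values therefore runs
   from 0.125 (= 16/16 * 2^-3) to 31.0, and values such as 0.1 (not an
   exact n/16 * 2^r) are rejected.  */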
17361
17362 /* Return true iff X can be represented by a quarter-precision
17363 floating point immediate operand. Note, we cannot represent 0.0. */
17364 bool
17365 aarch64_float_const_representable_p (rtx x)
17366 {
17367 /* This represents our current view of how many bits
17368 make up the mantissa. */
17369 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17370 int exponent;
17371 unsigned HOST_WIDE_INT mantissa, mask;
17372 REAL_VALUE_TYPE r, m;
17373 bool fail;
17374
17375 x = unwrap_const_vec_duplicate (x);
17376 if (!CONST_DOUBLE_P (x))
17377 return false;
17378
17379 if (GET_MODE (x) == VOIDmode
17380 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17381 return false;
17382
17383 r = *CONST_DOUBLE_REAL_VALUE (x);
17384
17385 /* We cannot represent infinities, NaNs or +/-zero. We won't
17386 know if we have +zero until we analyse the mantissa, but we
17387 can reject the other invalid values. */
17388 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17389 || REAL_VALUE_MINUS_ZERO (r))
17390 return false;
17391
17392 /* Extract exponent. */
17393 r = real_value_abs (&r);
17394 exponent = REAL_EXP (&r);
17395
17396 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17397 highest (sign) bit, with a fixed binary point at bit point_pos.
17398 The low element of W holds the low part of the mantissa, the high element the high part.
17399 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17400 bits for the mantissa, this can fail (low bits will be lost). */
17401 real_ldexp (&m, &r, point_pos - exponent);
17402 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17403
17404 /* If the low part of the mantissa has bits set we cannot represent
17405 the value. */
17406 if (w.ulow () != 0)
17407 return false;
17408 /* We have rejected the lower HOST_WIDE_INT, so update our
17409 understanding of how many bits lie in the mantissa and
17410 look only at the high HOST_WIDE_INT. */
17411 mantissa = w.elt (1);
17412 point_pos -= HOST_BITS_PER_WIDE_INT;
17413
17414 /* We can only represent values with a mantissa of the form 1.xxxx. */
17415 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17416 if ((mantissa & mask) != 0)
17417 return false;
17418
17419 /* Having filtered unrepresentable values, we may now remove all
17420 but the highest 5 bits. */
17421 mantissa >>= point_pos - 5;
17422
17423 /* We cannot represent the value 0.0, so reject it. This is handled
17424 elsewhere. */
17425 if (mantissa == 0)
17426 return false;
17427
17428 /* Then, as bit 4 is always set, we can mask it off, leaving
17429 the mantissa in the range [0, 15]. */
17430 mantissa &= ~(1 << 4);
17431 gcc_assert (mantissa <= 15);
17432
17433 /* GCC internally does not use IEEE754-like encoding (where normalized
17434 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17435 Our mantissa values are shifted 4 places to the left relative to
17436 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17437 by 5 places to correct for GCC's representation. */
17438 exponent = 5 - exponent;
17439
17440 return (exponent >= 0 && exponent <= 7);
17441 }
17442
17443 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17444 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17445 output MOVI/MVNI, ORR or BIC immediate. */
17446 char*
17447 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17448 enum simd_immediate_check which)
17449 {
17450 bool is_valid;
17451 static char templ[40];
17452 const char *mnemonic;
17453 const char *shift_op;
17454 unsigned int lane_count = 0;
17455 char element_char;
17456
17457 struct simd_immediate_info info;
17458
17459 /* This will return true to show const_vector is legal for use as either
17460 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17461 It will also update INFO to show how the immediate should be generated.
17462 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17463 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17464 gcc_assert (is_valid);
17465
17466 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17467 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17468
17469 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17470 {
17471 gcc_assert (info.insn == simd_immediate_info::MOV
17472 && info.u.mov.shift == 0);
17473 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17474 move immediate path. */
17475 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17476 info.u.mov.value = GEN_INT (0);
17477 else
17478 {
17479 const unsigned int buf_size = 20;
17480 char float_buf[buf_size] = {'\0'};
17481 real_to_decimal_for_mode (float_buf,
17482 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17483 buf_size, buf_size, 1, info.elt_mode);
17484
17485 if (lane_count == 1)
17486 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17487 else
17488 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17489 lane_count, element_char, float_buf);
17490 return templ;
17491 }
17492 }
17493
17494 gcc_assert (CONST_INT_P (info.u.mov.value));
17495
17496 if (which == AARCH64_CHECK_MOV)
17497 {
17498 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17499 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17500 ? "msl" : "lsl");
17501 if (lane_count == 1)
17502 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17503 mnemonic, UINTVAL (info.u.mov.value));
17504 else if (info.u.mov.shift)
17505 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17506 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17507 element_char, UINTVAL (info.u.mov.value), shift_op,
17508 info.u.mov.shift);
17509 else
17510 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17511 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17512 element_char, UINTVAL (info.u.mov.value));
17513 }
17514 else
17515 {
17516 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17517 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17518 if (info.u.mov.shift)
17519 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17520 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17521 element_char, UINTVAL (info.u.mov.value), "lsl",
17522 info.u.mov.shift);
17523 else
17524 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17525 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17526 element_char, UINTVAL (info.u.mov.value));
17527 }
17528 return templ;
17529 }
17530
17531 char*
17532 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17533 {
17534
17535 /* If a floating point number was passed and we desire to use it in an
17536 integer mode, do the conversion to integer. */
17537 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17538 {
17539 unsigned HOST_WIDE_INT ival;
17540 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17541 gcc_unreachable ();
17542 immediate = gen_int_mode (ival, mode);
17543 }
17544
17545 machine_mode vmode;
17546 /* Use a 64-bit mode for everything except DI/DF mode, where we use
17547 a 128-bit vector mode. */
17548 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17549
17550 vmode = aarch64_simd_container_mode (mode, width);
17551 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17552 return aarch64_output_simd_mov_immediate (v_op, width);
17553 }
17554
17555 /* Return the output string to use for moving immediate CONST_VECTOR
17556 into an SVE register. */
17557
17558 char *
17559 aarch64_output_sve_mov_immediate (rtx const_vector)
17560 {
17561 static char templ[40];
17562 struct simd_immediate_info info;
17563 char element_char;
17564
17565 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17566 gcc_assert (is_valid);
17567
17568 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17569
17570 machine_mode vec_mode = GET_MODE (const_vector);
17571 if (aarch64_sve_pred_mode_p (vec_mode))
17572 {
17573 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17574 if (info.insn == simd_immediate_info::MOV)
17575 {
17576 gcc_assert (info.u.mov.value == const0_rtx);
17577 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17578 }
17579 else
17580 {
17581 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17582 unsigned int total_bytes;
17583 if (info.u.pattern == AARCH64_SV_ALL
17584 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17585 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17586 total_bytes / GET_MODE_SIZE (info.elt_mode));
17587 else
17588 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17589 svpattern_token (info.u.pattern));
17590 }
17591 return buf;
17592 }
17593
17594 if (info.insn == simd_immediate_info::INDEX)
17595 {
17596 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17597 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17598 element_char, INTVAL (info.u.index.base),
17599 INTVAL (info.u.index.step));
17600 return templ;
17601 }
17602
17603 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17604 {
17605 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17606 info.u.mov.value = GEN_INT (0);
17607 else
17608 {
17609 const int buf_size = 20;
17610 char float_buf[buf_size] = {};
17611 real_to_decimal_for_mode (float_buf,
17612 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17613 buf_size, buf_size, 1, info.elt_mode);
17614
17615 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17616 element_char, float_buf);
17617 return templ;
17618 }
17619 }
17620
17621 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17622 element_char, INTVAL (info.u.mov.value));
17623 return templ;
17624 }
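
/* Some illustrative outputs from the routine above, assuming suitable
   immediates (register names are only examples):
     mov    z0.b, #1         - integer duplicate
     fmov   z0.s, #1.0e+0    - floating-point duplicate
     index  z0.s, #0, #1     - linear series
     ptrue  p0.b, all        - predicate constant  */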
17625
17626 /* Split operands into moves from op[1] + op[2] into op[0]. */
17627
17628 void
17629 aarch64_split_combinev16qi (rtx operands[3])
17630 {
17631 unsigned int dest = REGNO (operands[0]);
17632 unsigned int src1 = REGNO (operands[1]);
17633 unsigned int src2 = REGNO (operands[2]);
17634 machine_mode halfmode = GET_MODE (operands[1]);
17635 unsigned int halfregs = REG_NREGS (operands[1]);
17636 rtx destlo, desthi;
17637
17638 gcc_assert (halfmode == V16QImode);
17639
17640 if (src1 == dest && src2 == dest + halfregs)
17641 {
17642 /* No-op move. Can't split to nothing; emit something. */
17643 emit_note (NOTE_INSN_DELETED);
17644 return;
17645 }
17646
17647 /* Preserve register attributes for variable tracking. */
17648 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17649 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17650 GET_MODE_SIZE (halfmode));
17651
17652 /* Special case of reversed high/low parts. */
17653 if (reg_overlap_mentioned_p (operands[2], destlo)
17654 && reg_overlap_mentioned_p (operands[1], desthi))
17655 {
17656 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17657 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17658 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17659 }
17660 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17661 {
17662 /* Try to avoid unnecessary moves if part of the result
17663 is in the right place already. */
17664 if (src1 != dest)
17665 emit_move_insn (destlo, operands[1]);
17666 if (src2 != dest + halfregs)
17667 emit_move_insn (desthi, operands[2]);
17668 }
17669 else
17670 {
17671 if (src2 != dest + halfregs)
17672 emit_move_insn (desthi, operands[2]);
17673 if (src1 != dest)
17674 emit_move_insn (destlo, operands[1]);
17675 }
17676 }
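
/* The three XORs above implement the classic in-place register swap:
   after a ^= b; b ^= a; a ^= b; the two registers have exchanged values.
   This handles the reversed high/low case even though both destination
   halves are already live, without needing a scratch register. */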
17677
17678 /* vec_perm support. */
17679
17680 struct expand_vec_perm_d
17681 {
17682 rtx target, op0, op1;
17683 vec_perm_indices perm;
17684 machine_mode vmode;
17685 unsigned int vec_flags;
17686 bool one_vector_p;
17687 bool testing_p;
17688 };
17689
17690 /* Generate a variable permutation. */
17691
17692 static void
17693 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17694 {
17695 machine_mode vmode = GET_MODE (target);
17696 bool one_vector_p = rtx_equal_p (op0, op1);
17697
17698 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17699 gcc_checking_assert (GET_MODE (op0) == vmode);
17700 gcc_checking_assert (GET_MODE (op1) == vmode);
17701 gcc_checking_assert (GET_MODE (sel) == vmode);
17702 gcc_checking_assert (TARGET_SIMD);
17703
17704 if (one_vector_p)
17705 {
17706 if (vmode == V8QImode)
17707 {
17708 /* Expand the argument to a V16QI mode by duplicating it. */
17709 rtx pair = gen_reg_rtx (V16QImode);
17710 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17711 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17712 }
17713 else
17714 {
17715 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17716 }
17717 }
17718 else
17719 {
17720 rtx pair;
17721
17722 if (vmode == V8QImode)
17723 {
17724 pair = gen_reg_rtx (V16QImode);
17725 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17726 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17727 }
17728 else
17729 {
17730 pair = gen_reg_rtx (OImode);
17731 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17732 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17733 }
17734 }
17735 }
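
/* In other words, the single-vector V16QI case above becomes one
   "tbl v0.16b, {v1.16b}, v2.16b" (register names illustrative), while the
   two-vector V16QI case first combines the inputs into a register pair
   and uses the two-register form of TBL. */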
17736
17737 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17738 NELT is the number of elements in the vector. */
17739
17740 void
17741 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17742 unsigned int nelt)
17743 {
17744 machine_mode vmode = GET_MODE (target);
17745 bool one_vector_p = rtx_equal_p (op0, op1);
17746 rtx mask;
17747
17748 /* The TBL instruction does not use a modulo index, so we must take care
17749 of that ourselves. */
17750 mask = aarch64_simd_gen_const_vector_dup (vmode,
17751 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17752 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17753
17754 /* For big-endian, we also need to reverse the index within the vector
17755 (but not which vector). */
17756 if (BYTES_BIG_ENDIAN)
17757 {
17758 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17759 if (!one_vector_p)
17760 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17761 sel = expand_simple_binop (vmode, XOR, sel, mask,
17762 NULL, 0, OPTAB_LIB_WIDEN);
17763 }
17764 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17765 }
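
/* As a concrete little-endian example of the masking above: for a
   two-vector V16QI permute, an out-of-range selector byte such as 35 is
   reduced to 35 & 31 == 3, giving the wrapping behaviour that vec_perm
   requires but that TBL itself does not provide. */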
17766
17767 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17768
17769 static void
17770 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17771 {
17772 emit_insn (gen_rtx_SET (target,
17773 gen_rtx_UNSPEC (GET_MODE (target),
17774 gen_rtvec (2, op0, op1), code)));
17775 }
17776
17777 /* Expand an SVE vec_perm with the given operands. */
17778
17779 void
17780 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17781 {
17782 machine_mode data_mode = GET_MODE (target);
17783 machine_mode sel_mode = GET_MODE (sel);
17784 /* Enforced by the pattern condition. */
17785 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17786
17787 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17788 size of the two value vectors, i.e. the upper bits of the indices
17789 are effectively ignored. SVE TBL instead produces 0 for any
17790 out-of-range indices, so we need to modulo all the vec_perm indices
17791 to ensure they are all in range. */
17792 rtx sel_reg = force_reg (sel_mode, sel);
17793
17794 /* Check if the sel only references the first values vector. */
17795 if (GET_CODE (sel) == CONST_VECTOR
17796 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17797 {
17798 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17799 return;
17800 }
17801
17802 /* Check if the two values vectors are the same. */
17803 if (rtx_equal_p (op0, op1))
17804 {
17805 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17806 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17807 NULL, 0, OPTAB_DIRECT);
17808 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17809 return;
17810 }
17811
17812 /* Run TBL on each value vector and combine the results. */
17813
17814 rtx res0 = gen_reg_rtx (data_mode);
17815 rtx res1 = gen_reg_rtx (data_mode);
17816 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17817 if (GET_CODE (sel) != CONST_VECTOR
17818 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17819 {
17820 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17821 2 * nunits - 1);
17822 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17823 NULL, 0, OPTAB_DIRECT);
17824 }
17825 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17826 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17827 NULL, 0, OPTAB_DIRECT);
17828 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17829 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17830 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17831 else
17832 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17833 }
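
/* In the general two-vector case above, the result is built as
   TBL (op0, sel) IORed with TBL (op1, sel - nunits): out-of-range indices
   produce zero in each individual TBL, so every element comes from
   exactly one of the two lookups. */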
17834
17835 /* Recognize patterns suitable for the TRN instructions. */
17836 static bool
17837 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17838 {
17839 HOST_WIDE_INT odd;
17840 poly_uint64 nelt = d->perm.length ();
17841 rtx out, in0, in1, x;
17842 machine_mode vmode = d->vmode;
17843
17844 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17845 return false;
17846
17847 /* Note that these are little-endian tests.
17848 We correct for big-endian later. */
17849 if (!d->perm[0].is_constant (&odd)
17850 || (odd != 0 && odd != 1)
17851 || !d->perm.series_p (0, 2, odd, 2)
17852 || !d->perm.series_p (1, 2, nelt + odd, 2))
17853 return false;
17854
17855 /* Success! */
17856 if (d->testing_p)
17857 return true;
17858
17859 in0 = d->op0;
17860 in1 = d->op1;
17861 /* We don't need a big-endian lane correction for SVE; see the comment
17862 at the head of aarch64-sve.md for details. */
17863 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17864 {
17865 x = in0, in0 = in1, in1 = x;
17866 odd = !odd;
17867 }
17868 out = d->target;
17869
17870 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17871 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17872 return true;
17873 }
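
/* For reference, using the little-endian numbering checked above, the
   index patterns accepted for a V4SI permute are { 0, 4, 2, 6 } (TRN1)
   and { 1, 5, 3, 7 } (TRN2), i.e. pairwise transposition of the even or
   odd lanes of the two inputs. */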
17874
17875 /* Recognize patterns suitable for the UZP instructions. */
17876 static bool
17877 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17878 {
17879 HOST_WIDE_INT odd;
17880 rtx out, in0, in1, x;
17881 machine_mode vmode = d->vmode;
17882
17883 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17884 return false;
17885
17886 /* Note that these are little-endian tests.
17887 We correct for big-endian later. */
17888 if (!d->perm[0].is_constant (&odd)
17889 || (odd != 0 && odd != 1)
17890 || !d->perm.series_p (0, 1, odd, 2))
17891 return false;
17892
17893 /* Success! */
17894 if (d->testing_p)
17895 return true;
17896
17897 in0 = d->op0;
17898 in1 = d->op1;
17899 /* We don't need a big-endian lane correction for SVE; see the comment
17900 at the head of aarch64-sve.md for details. */
17901 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17902 {
17903 x = in0, in0 = in1, in1 = x;
17904 odd = !odd;
17905 }
17906 out = d->target;
17907
17908 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17909 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17910 return true;
17911 }
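
/* For reference, using the little-endian numbering checked above, the
   index patterns accepted for a V4SI permute are { 0, 2, 4, 6 } (UZP1)
   and { 1, 3, 5, 7 } (UZP2), i.e. the even-numbered or odd-numbered
   elements of the concatenated inputs. */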
17912
17913 /* Recognize patterns suitable for the ZIP instructions. */
17914 static bool
17915 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17916 {
17917 unsigned int high;
17918 poly_uint64 nelt = d->perm.length ();
17919 rtx out, in0, in1, x;
17920 machine_mode vmode = d->vmode;
17921
17922 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17923 return false;
17924
17925 /* Note that these are little-endian tests.
17926 We correct for big-endian later. */
17927 poly_uint64 first = d->perm[0];
17928 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17929 || !d->perm.series_p (0, 2, first, 1)
17930 || !d->perm.series_p (1, 2, first + nelt, 1))
17931 return false;
17932 high = maybe_ne (first, 0U);
17933
17934 /* Success! */
17935 if (d->testing_p)
17936 return true;
17937
17938 in0 = d->op0;
17939 in1 = d->op1;
17940 /* We don't need a big-endian lane correction for SVE; see the comment
17941 at the head of aarch64-sve.md for details. */
17942 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17943 {
17944 x = in0, in0 = in1, in1 = x;
17945 high = !high;
17946 }
17947 out = d->target;
17948
17949 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17950 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17951 return true;
17952 }
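
/* For reference, using the little-endian numbering checked above, the
   index patterns accepted for a V4SI permute are { 0, 4, 1, 5 } (ZIP1)
   and { 2, 6, 3, 7 } (ZIP2), i.e. the low or high halves of the two
   inputs interleaved. */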
17953
17954 /* Recognize patterns for the EXT insn. */
17955
17956 static bool
17957 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17958 {
17959 HOST_WIDE_INT location;
17960 rtx offset;
17961
17962 /* The first element always refers to the first vector.
17963 Check if the extracted indices are increasing by one. */
17964 if (d->vec_flags == VEC_SVE_PRED
17965 || !d->perm[0].is_constant (&location)
17966 || !d->perm.series_p (0, 1, location, 1))
17967 return false;
17968
17969 /* Success! */
17970 if (d->testing_p)
17971 return true;
17972
17973 /* The case where (location == 0) is a no-op for both big- and little-endian,
17974 and is removed by the mid-end at optimization levels -O1 and higher.
17975
17976 We don't need a big-endian lane correction for SVE; see the comment
17977 at the head of aarch64-sve.md for details. */
17978 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17979 {
17980 /* After setup, we want the high elements of the first vector (stored
17981 at the LSB end of the register), and the low elements of the second
17982 vector (stored at the MSB end of the register). So swap. */
17983 std::swap (d->op0, d->op1);
17984 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17985 to_constant () is safe since this is restricted to Advanced SIMD
17986 vectors. */
17987 location = d->perm.length ().to_constant () - location;
17988 }
17989
17990 offset = GEN_INT (location);
17991 emit_set_insn (d->target,
17992 gen_rtx_UNSPEC (d->vmode,
17993 gen_rtvec (3, d->op0, d->op1, offset),
17994 UNSPEC_EXT));
17995 return true;
17996 }
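
/* For reference, a V4SI selector such as { 1, 2, 3, 4 } is matched above
   with LOCATION == 1 and becomes a single EXT instruction, extracting a
   vector that starts one element into the concatenation of the two
   inputs. */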
17997
17998 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17999 within each 64-bit, 32-bit or 16-bit granule. */
18000
18001 static bool
18002 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
18003 {
18004 HOST_WIDE_INT diff;
18005 unsigned int i, size, unspec;
18006 machine_mode pred_mode;
18007
18008 if (d->vec_flags == VEC_SVE_PRED
18009 || !d->one_vector_p
18010 || !d->perm[0].is_constant (&diff))
18011 return false;
18012
18013 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
18014 if (size == 8)
18015 {
18016 unspec = UNSPEC_REV64;
18017 pred_mode = VNx2BImode;
18018 }
18019 else if (size == 4)
18020 {
18021 unspec = UNSPEC_REV32;
18022 pred_mode = VNx4BImode;
18023 }
18024 else if (size == 2)
18025 {
18026 unspec = UNSPEC_REV16;
18027 pred_mode = VNx8BImode;
18028 }
18029 else
18030 return false;
18031
18032 unsigned int step = diff + 1;
18033 for (i = 0; i < step; ++i)
18034 if (!d->perm.series_p (i, step, diff - i, step))
18035 return false;
18036
18037 /* Success! */
18038 if (d->testing_p)
18039 return true;
18040
18041 if (d->vec_flags == VEC_SVE_DATA)
18042 {
18043 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
18044 rtx target = gen_reg_rtx (int_mode);
18045 if (BYTES_BIG_ENDIAN)
18046 /* The act of taking a subreg between INT_MODE and d->vmode
18047 is itself a reversing operation on big-endian targets;
18048 see the comment at the head of aarch64-sve.md for details.
18049 First reinterpret OP0 as INT_MODE without using a subreg
18050 and without changing the contents. */
18051 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
18052 else
18053 {
18054 /* For SVE we use REV[BHW] unspecs derived from the element size
18055 of d->vmode and vector modes whose elements have SIZE bytes.
18056 This ensures that the vector modes match the predicate modes. */
18057 int unspec = aarch64_sve_rev_unspec (d->vmode);
18058 rtx pred = aarch64_ptrue_reg (pred_mode);
18059 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
18060 gen_lowpart (int_mode, d->op0)));
18061 }
18062 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18063 return true;
18064 }
18065 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
18066 emit_set_insn (d->target, src);
18067 return true;
18068 }
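
/* For reference, a V8HI selector of { 3, 2, 1, 0, 7, 6, 5, 4 } is matched
   above with DIFF == 3 (a 64-bit granule) and maps to REV64 on Advanced
   SIMD, or to the equivalent predicated reversal handled in the
   VEC_SVE_DATA branch for SVE. */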
18069
18070 /* Recognize patterns for the REV insn, which reverses elements within
18071 a full vector. */
18072
18073 static bool
18074 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
18075 {
18076 poly_uint64 nelt = d->perm.length ();
18077
18078 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
18079 return false;
18080
18081 if (!d->perm.series_p (0, 1, nelt - 1, -1))
18082 return false;
18083
18084 /* Success! */
18085 if (d->testing_p)
18086 return true;
18087
18088 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
18089 emit_set_insn (d->target, src);
18090 return true;
18091 }
18092
18093 static bool
18094 aarch64_evpc_dup (struct expand_vec_perm_d *d)
18095 {
18096 rtx out = d->target;
18097 rtx in0;
18098 HOST_WIDE_INT elt;
18099 machine_mode vmode = d->vmode;
18100 rtx lane;
18101
18102 if (d->vec_flags == VEC_SVE_PRED
18103 || d->perm.encoding ().encoded_nelts () != 1
18104 || !d->perm[0].is_constant (&elt))
18105 return false;
18106
18107 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
18108 return false;
18109
18110 /* Success! */
18111 if (d->testing_p)
18112 return true;
18113
18114 /* The generic preparation in aarch64_expand_vec_perm_const_1
18115 swaps the operand order and the permute indices if it finds
18116 d->perm[0] to be in the second operand. Thus, we can always
18117 use d->op0 and need not do any extra arithmetic to get the
18118 correct lane number. */
18119 in0 = d->op0;
18120 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
18121
18122 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
18123 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
18124 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
18125 return true;
18126 }
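
/* For reference, a selector whose encoding is a single repeated index,
   e.g. { 2, 2, 2, 2 } for V4SI, is matched above and becomes a DUP of
   lane 2 of the first input, e.g. "dup v0.4s, v1.s[2]" (register names
   illustrative). */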
18127
18128 static bool
18129 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
18130 {
18131 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
18132 machine_mode vmode = d->vmode;
18133
18134 /* Make sure that the indices are constant. */
18135 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
18136 for (unsigned int i = 0; i < encoded_nelts; ++i)
18137 if (!d->perm[i].is_constant ())
18138 return false;
18139
18140 if (d->testing_p)
18141 return true;
18142
18143 /* Generic code will try constant permutation twice: once with the
18144 original mode and again with the elements lowered to QImode.
18145 So wait and don't do the selector expansion ourselves. */
18146 if (vmode != V8QImode && vmode != V16QImode)
18147 return false;
18148
18149 /* to_constant is safe since this routine is specific to Advanced SIMD
18150 vectors. */
18151 unsigned int nelt = d->perm.length ().to_constant ();
18152 for (unsigned int i = 0; i < nelt; ++i)
18153 /* If big-endian and two vectors, we end up with a weird mixed-endian
18154 mode on NEON. Reverse the index within each word but not the word
18155 itself. to_constant is safe because we checked is_constant above. */
18156 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
18157 ? d->perm[i].to_constant () ^ (nelt - 1)
18158 : d->perm[i].to_constant ());
18159
18160 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18161 sel = force_reg (vmode, sel);
18162
18163 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
18164 return true;
18165 }
18166
18167 /* Try to implement D using an SVE TBL instruction. */
18168
18169 static bool
18170 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
18171 {
18172 unsigned HOST_WIDE_INT nelt;
18173
18174 /* Permuting two variable-length vectors could overflow the
18175 index range. */
18176 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
18177 return false;
18178
18179 if (d->testing_p)
18180 return true;
18181
18182 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
18183 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
18184 if (d->one_vector_p)
18185 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
18186 else
18187 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
18188 return true;
18189 }
18190
18191 /* Try to implement D using SVE SEL instruction. */
18192
18193 static bool
18194 aarch64_evpc_sel (struct expand_vec_perm_d *d)
18195 {
18196 machine_mode vmode = d->vmode;
18197 int unit_size = GET_MODE_UNIT_SIZE (vmode);
18198
18199 if (d->vec_flags != VEC_SVE_DATA
18200 || unit_size > 8)
18201 return false;
18202
18203 int n_patterns = d->perm.encoding ().npatterns ();
18204 poly_int64 vec_len = d->perm.length ();
18205
18206 for (int i = 0; i < n_patterns; ++i)
18207 if (!known_eq (d->perm[i], i)
18208 && !known_eq (d->perm[i], vec_len + i))
18209 return false;
18210
18211 for (int i = n_patterns; i < n_patterns * 2; i++)
18212 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
18213 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
18214 return false;
18215
18216 if (d->testing_p)
18217 return true;
18218
18219 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
18220
18221 rtx_vector_builder builder (pred_mode, n_patterns, 2);
18222 for (int i = 0; i < n_patterns * 2; i++)
18223 {
18224 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
18225 : CONST0_RTX (BImode);
18226 builder.quick_push (elem);
18227 }
18228
18229 rtx const_vec = builder.build ();
18230 rtx pred = force_reg (pred_mode, const_vec);
18231 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
18232 return true;
18233 }
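
/* For reference, the checks above accept selectors in which lane I is
   either I (take element I of the first input) or LEN + I (take element I
   of the second input); that per-lane choice is exactly what a predicated
   SEL provides. */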
18234
18235 static bool
18236 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18237 {
18238 /* The pattern matching functions above are written to look for a small
18239 number to begin the sequence (0, 1, N/2). If we begin with an index
18240 from the second operand, we can swap the operands. */
18241 poly_int64 nelt = d->perm.length ();
18242 if (known_ge (d->perm[0], nelt))
18243 {
18244 d->perm.rotate_inputs (1);
18245 std::swap (d->op0, d->op1);
18246 }
18247
18248 if ((d->vec_flags == VEC_ADVSIMD
18249 || d->vec_flags == VEC_SVE_DATA
18250 || d->vec_flags == VEC_SVE_PRED)
18251 && known_gt (nelt, 1))
18252 {
18253 if (aarch64_evpc_rev_local (d))
18254 return true;
18255 else if (aarch64_evpc_rev_global (d))
18256 return true;
18257 else if (aarch64_evpc_ext (d))
18258 return true;
18259 else if (aarch64_evpc_dup (d))
18260 return true;
18261 else if (aarch64_evpc_zip (d))
18262 return true;
18263 else if (aarch64_evpc_uzp (d))
18264 return true;
18265 else if (aarch64_evpc_trn (d))
18266 return true;
18267 else if (aarch64_evpc_sel (d))
18268 return true;
18269 if (d->vec_flags == VEC_SVE_DATA)
18270 return aarch64_evpc_sve_tbl (d);
18271 else if (d->vec_flags == VEC_ADVSIMD)
18272 return aarch64_evpc_tbl (d);
18273 }
18274 return false;
18275 }
18276
18277 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18278
18279 static bool
18280 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18281 rtx op1, const vec_perm_indices &sel)
18282 {
18283 struct expand_vec_perm_d d;
18284
18285 /* Check whether the mask can be applied to a single vector. */
18286 if (sel.ninputs () == 1
18287 || (op0 && rtx_equal_p (op0, op1)))
18288 d.one_vector_p = true;
18289 else if (sel.all_from_input_p (0))
18290 {
18291 d.one_vector_p = true;
18292 op1 = op0;
18293 }
18294 else if (sel.all_from_input_p (1))
18295 {
18296 d.one_vector_p = true;
18297 op0 = op1;
18298 }
18299 else
18300 d.one_vector_p = false;
18301
18302 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18303 sel.nelts_per_input ());
18304 d.vmode = vmode;
18305 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18306 d.target = target;
18307 d.op0 = op0;
18308 d.op1 = op1;
18309 d.testing_p = !target;
18310
18311 if (!d.testing_p)
18312 return aarch64_expand_vec_perm_const_1 (&d);
18313
18314 rtx_insn *last = get_last_insn ();
18315 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18316 gcc_assert (last == get_last_insn ());
18317
18318 return ret;
18319 }
18320
18321 /* Generate a byte permute mask for a register of mode MODE,
18322 which has NUNITS units. */
18323
18324 rtx
18325 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18326 {
18327 /* We have to reverse each vector because we don't have
18328 a permuted load that can reverse-load according to ABI rules. */
18329 rtx mask;
18330 rtvec v = rtvec_alloc (16);
18331 unsigned int i, j;
18332 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18333
18334 gcc_assert (BYTES_BIG_ENDIAN);
18335 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18336
18337 for (i = 0; i < nunits; i++)
18338 for (j = 0; j < usize; j++)
18339 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18340 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18341 return force_reg (V16QImode, mask);
18342 }
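
/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built
   above is the byte permutation { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, ... },
   i.e. the bytes within each element are reversed while the element
   order is preserved. */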
18343
18344 /* Expand an SVE integer comparison using the SVE equivalent of:
18345
18346 (set TARGET (CODE OP0 OP1)). */
18347
18348 void
18349 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18350 {
18351 machine_mode pred_mode = GET_MODE (target);
18352 machine_mode data_mode = GET_MODE (op0);
18353 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18354 op0, op1);
18355 if (!rtx_equal_p (target, res))
18356 emit_move_insn (target, res);
18357 }
18358
18359 /* Return the UNSPEC_COND_* code for comparison CODE. */
18360
18361 static unsigned int
18362 aarch64_unspec_cond_code (rtx_code code)
18363 {
18364 switch (code)
18365 {
18366 case NE:
18367 return UNSPEC_COND_FCMNE;
18368 case EQ:
18369 return UNSPEC_COND_FCMEQ;
18370 case LT:
18371 return UNSPEC_COND_FCMLT;
18372 case GT:
18373 return UNSPEC_COND_FCMGT;
18374 case LE:
18375 return UNSPEC_COND_FCMLE;
18376 case GE:
18377 return UNSPEC_COND_FCMGE;
18378 case UNORDERED:
18379 return UNSPEC_COND_FCMUO;
18380 default:
18381 gcc_unreachable ();
18382 }
18383 }
18384
18385 /* Emit:
18386
18387 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18388
18389 where <X> is the operation associated with comparison CODE.
18390 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18391
18392 static void
18393 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18394 bool known_ptrue_p, rtx op0, rtx op1)
18395 {
18396 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18397 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18398 gen_rtvec (4, pred, flag, op0, op1),
18399 aarch64_unspec_cond_code (code));
18400 emit_set_insn (target, unspec);
18401 }
18402
18403 /* Emit the SVE equivalent of:
18404
18405 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18406 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18407 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18408
18409 where <Xi> is the operation associated with comparison CODEi.
18410 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18411
18412 static void
18413 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18414 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18415 {
18416 machine_mode pred_mode = GET_MODE (pred);
18417 rtx tmp1 = gen_reg_rtx (pred_mode);
18418 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18419 rtx tmp2 = gen_reg_rtx (pred_mode);
18420 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18421 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18422 }
18423
18424 /* Emit the SVE equivalent of:
18425
18426 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18427 (set TARGET (not TMP))
18428
18429 where <X> is the operation associated with comparison CODE.
18430 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18431
18432 static void
18433 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18434 bool known_ptrue_p, rtx op0, rtx op1)
18435 {
18436 machine_mode pred_mode = GET_MODE (pred);
18437 rtx tmp = gen_reg_rtx (pred_mode);
18438 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18439 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18440 }
18441
18442 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18443
18444 (set TARGET (CODE OP0 OP1))
18445
18446 If CAN_INVERT_P is true, the caller can also handle inverted results;
18447 return true if the result is in fact inverted. */
18448
18449 bool
18450 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18451 rtx op0, rtx op1, bool can_invert_p)
18452 {
18453 machine_mode pred_mode = GET_MODE (target);
18454 machine_mode data_mode = GET_MODE (op0);
18455
18456 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18457 switch (code)
18458 {
18459 case UNORDERED:
18460 /* UNORDERED has no immediate form. */
18461 op1 = force_reg (data_mode, op1);
18462 /* fall through */
18463 case LT:
18464 case LE:
18465 case GT:
18466 case GE:
18467 case EQ:
18468 case NE:
18469 {
18470 /* There is native support for the comparison. */
18471 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18472 return false;
18473 }
18474
18475 case LTGT:
18476 /* This is a trapping operation (LT or GT). */
18477 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18478 return false;
18479
18480 case UNEQ:
18481 if (!flag_trapping_math)
18482 {
18483 /* This would trap for signaling NaNs. */
18484 op1 = force_reg (data_mode, op1);
18485 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18486 ptrue, true, op0, op1);
18487 return false;
18488 }
18489 /* fall through */
18490 case UNLT:
18491 case UNLE:
18492 case UNGT:
18493 case UNGE:
18494 if (flag_trapping_math)
18495 {
18496 /* Work out which elements are ordered. */
18497 rtx ordered = gen_reg_rtx (pred_mode);
18498 op1 = force_reg (data_mode, op1);
18499 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18500 ptrue, true, op0, op1);
18501
18502 /* Test the opposite condition for the ordered elements,
18503 then invert the result. */
18504 if (code == UNEQ)
18505 code = NE;
18506 else
18507 code = reverse_condition_maybe_unordered (code);
18508 if (can_invert_p)
18509 {
18510 aarch64_emit_sve_fp_cond (target, code,
18511 ordered, false, op0, op1);
18512 return true;
18513 }
18514 aarch64_emit_sve_invert_fp_cond (target, code,
18515 ordered, false, op0, op1);
18516 return false;
18517 }
18518 break;
18519
18520 case ORDERED:
18521 /* ORDERED has no immediate form. */
18522 op1 = force_reg (data_mode, op1);
18523 break;
18524
18525 default:
18526 gcc_unreachable ();
18527 }
18528
18529 /* There is native support for the inverse comparison. */
18530 code = reverse_condition_maybe_unordered (code);
18531 if (can_invert_p)
18532 {
18533 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18534 return true;
18535 }
18536 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18537 return false;
18538 }
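
/* For example, with trapping math a comparison such as UNLT is handled
   above by computing the ordered lanes with an inverted FCMUO, testing GE
   on just those lanes and then inverting the result (or returning the
   inverted form directly when CAN_INVERT_P), so that unordered inputs
   cannot raise spurious exceptions. */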
18539
18540 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18541 of the data being selected and CMP_MODE is the mode of the values being
18542 compared. */
18543
18544 void
18545 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18546 rtx *ops)
18547 {
18548 machine_mode pred_mode
18549 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18550 GET_MODE_SIZE (cmp_mode)).require ();
18551 rtx pred = gen_reg_rtx (pred_mode);
18552 if (FLOAT_MODE_P (cmp_mode))
18553 {
18554 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18555 ops[4], ops[5], true))
18556 std::swap (ops[1], ops[2]);
18557 }
18558 else
18559 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18560
18561 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18562 ops[1] = force_reg (data_mode, ops[1]);
18563 /* The "false" value can only be zero if the "true" value is a constant. */
18564 if (register_operand (ops[1], data_mode)
18565 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18566 ops[2] = force_reg (data_mode, ops[2]);
18567
18568 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18569 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18570 }
18571
18572 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18573 true. However due to issues with register allocation it is preferable
18574 to avoid tying integer scalar and FP scalar modes. Executing integer
18575 operations in general registers is better than treating them as scalar
18576 vector operations. This reduces latency and avoids redundant int<->FP
18577 moves. So tie modes if they are either the same class, or vector modes
18578 with other vector modes, vector structs or any scalar mode. */
18579
18580 static bool
18581 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18582 {
18583 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18584 return true;
18585
18586 /* We specifically want to allow elements of "structure" modes to
18587 be tieable to the structure. This more general condition allows
18588 other rarer situations too. The reason we don't extend this to
18589 predicate modes is that there are no predicate structure modes
18590 nor any specific instructions for extracting part of a predicate
18591 register. */
18592 if (aarch64_vector_data_mode_p (mode1)
18593 && aarch64_vector_data_mode_p (mode2))
18594 return true;
18595
18596 /* Also allow any scalar modes with vectors. */
18597 if (aarch64_vector_mode_supported_p (mode1)
18598 || aarch64_vector_mode_supported_p (mode2))
18599 return true;
18600
18601 return false;
18602 }
18603
18604 /* Return a new RTX holding the result of moving POINTER forward by
18605 AMOUNT bytes. */
18606
18607 static rtx
18608 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18609 {
18610 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18611
18612 return adjust_automodify_address (pointer, GET_MODE (pointer),
18613 next, amount);
18614 }
18615
18616 /* Return a new RTX holding the result of moving POINTER forward by the
18617 size of the mode it points to. */
18618
18619 static rtx
18620 aarch64_progress_pointer (rtx pointer)
18621 {
18622 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18623 }
18624
18625 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18626 MODE bytes. */
18627
18628 static void
18629 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18630 machine_mode mode)
18631 {
18632 rtx reg = gen_reg_rtx (mode);
18633
18634 /* "Cast" the pointers to the correct mode. */
18635 *src = adjust_address (*src, mode, 0);
18636 *dst = adjust_address (*dst, mode, 0);
18637 /* Emit the memcpy. */
18638 emit_move_insn (reg, *src);
18639 emit_move_insn (*dst, reg);
18640 /* Move the pointers forward. */
18641 *src = aarch64_progress_pointer (*src);
18642 *dst = aarch64_progress_pointer (*dst);
18643 }
18644
18645 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18646 we succeed, otherwise return false. */
18647
18648 bool
18649 aarch64_expand_cpymem (rtx *operands)
18650 {
18651 int n, mode_bits;
18652 rtx dst = operands[0];
18653 rtx src = operands[1];
18654 rtx base;
18655 machine_mode cur_mode = BLKmode, next_mode;
18656 bool speed_p = !optimize_function_for_size_p (cfun);
18657
18658 /* When optimizing for size, give a better estimate of the length of a
18659 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18660 will always require an even number of instructions to perform. And each
18661 operation requires both a load and a store, so divide the max number by 2. */
18662 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18663
18664 /* We can't do anything smart if the amount to copy is not constant. */
18665 if (!CONST_INT_P (operands[2]))
18666 return false;
18667
18668 n = INTVAL (operands[2]);
18669
18670 /* Try to keep the number of instructions low. For all cases we will do at
18671 most two moves for the residual amount, since we'll always overlap the
18672 remainder. */
18673 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18674 return false;
18675
18676 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18677 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18678
18679 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18680 src = adjust_automodify_address (src, VOIDmode, base, 0);
18681
18682 /* Convert n to bits to make the rest of the code simpler. */
18683 n = n * BITS_PER_UNIT;
18684
18685 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18686 larger than TImode, but we should not use them for loads/stores here. */
18687 const int copy_limit = GET_MODE_BITSIZE (TImode);
18688
18689 while (n > 0)
18690 {
18691 /* Find the largest mode in which to do the copy without over-reading
18692 or over-writing. */
18693 opt_scalar_int_mode mode_iter;
18694 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18695 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18696 cur_mode = mode_iter.require ();
18697
18698 gcc_assert (cur_mode != BLKmode);
18699
18700 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18701 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18702
18703 n -= mode_bits;
18704
18705 /* Do certain trailing copies as overlapping if it's going to be
18706 cheaper, i.e. fewer instructions to do so. For instance, for a 15-byte
18707 copy it's more efficient to do two overlapping 8-byte copies than
18708 8 + 6 + 1. */
18709 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18710 {
18711 next_mode = smallest_mode_for_size (n, MODE_INT);
18712 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18713 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18714 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18715 n = n_bits;
18716 }
18717 }
18718
18719 return true;
18720 }
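
/* As a worked example of the tail handling above: a 15-byte copy is
   expanded as one 8-byte (DImode) copy from offset 0 followed by an
   overlapping 8-byte copy from offset 7, rather than by separate smaller
   copies of the 7-byte remainder. */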
18721
18722 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18723 SImode stores. Handle the case when the constant has identical
18724 bottom and top halves. This is beneficial when the two stores can be
18725 merged into an STP and we avoid synthesising potentially expensive
18726 immediates twice. Return true if such a split is possible. */
18727
18728 bool
18729 aarch64_split_dimode_const_store (rtx dst, rtx src)
18730 {
18731 rtx lo = gen_lowpart (SImode, src);
18732 rtx hi = gen_highpart_mode (SImode, DImode, src);
18733
18734 bool size_p = optimize_function_for_size_p (cfun);
18735
18736 if (!rtx_equal_p (lo, hi))
18737 return false;
18738
18739 unsigned int orig_cost
18740 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18741 unsigned int lo_cost
18742 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18743
18744 /* We want to transform:
18745 MOV x1, 49370
18746 MOVK x1, 0x140, lsl 16
18747 MOVK x1, 0xc0da, lsl 32
18748 MOVK x1, 0x140, lsl 48
18749 STR x1, [x0]
18750 into:
18751 MOV w1, 49370
18752 MOVK w1, 0x140, lsl 16
18753 STP w1, w1, [x0]
18754 So we want to perform this only when we save two instructions
18755 or more. When optimizing for size, however, accept any code size
18756 savings we can. */
18757 if (size_p && orig_cost <= lo_cost)
18758 return false;
18759
18760 if (!size_p
18761 && (orig_cost <= lo_cost + 1))
18762 return false;
18763
18764 rtx mem_lo = adjust_address (dst, SImode, 0);
18765 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18766 return false;
18767
18768 rtx tmp_reg = gen_reg_rtx (SImode);
18769 aarch64_expand_mov_immediate (tmp_reg, lo);
18770 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18771 /* Don't emit an explicit store pair as this may not always be profitable.
18772 Let the sched-fusion logic decide whether to merge them. */
18773 emit_move_insn (mem_lo, tmp_reg);
18774 emit_move_insn (mem_hi, tmp_reg);
18775
18776 return true;
18777 }
18778
18779 /* Generate RTL for a conditional branch with rtx comparison CODE in
18780 mode CC_MODE. The destination of the unlikely conditional branch
18781 is LABEL_REF. */
18782
18783 void
18784 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18785 rtx label_ref)
18786 {
18787 rtx x;
18788 x = gen_rtx_fmt_ee (code, VOIDmode,
18789 gen_rtx_REG (cc_mode, CC_REGNUM),
18790 const0_rtx);
18791
18792 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18793 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18794 pc_rtx);
18795 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18796 }
18797
18798 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18799
18800 OP1 represents the TImode destination operand 1
18801 OP2 represents the TImode destination operand 2
18802 LOW_DEST represents the low half (DImode) of TImode operand 0
18803 LOW_IN1 represents the low half (DImode) of TImode operand 1
18804 LOW_IN2 represents the low half (DImode) of TImode operand 2
18805 HIGH_DEST represents the high half (DImode) of TImode operand 0
18806 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18807 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18808
18809 void
18810 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18811 rtx *low_in1, rtx *low_in2,
18812 rtx *high_dest, rtx *high_in1,
18813 rtx *high_in2)
18814 {
18815 *low_dest = gen_reg_rtx (DImode);
18816 *low_in1 = gen_lowpart (DImode, op1);
18817 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18818 subreg_lowpart_offset (DImode, TImode));
18819 *high_dest = gen_reg_rtx (DImode);
18820 *high_in1 = gen_highpart (DImode, op1);
18821 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18822 subreg_highpart_offset (DImode, TImode));
18823 }
18824
18825 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18826
18827 This function differs from 'aarch64_addti_scratch_regs' in that
18828 OP1 can be an immediate constant (zero). We must call
18829 subreg_highpart_offset with DImode and TImode arguments, otherwise
18830 VOIDmode will be used for the const_int which generates an internal
18831 error from subreg_size_highpart_offset which does not expect a size of zero.
18832
18833 OP1 represents the TImode destination operand 1
18834 OP2 represents the TImode destination operand 2
18835 LOW_DEST represents the low half (DImode) of TImode operand 0
18836 LOW_IN1 represents the low half (DImode) of TImode operand 1
18837 LOW_IN2 represents the low half (DImode) of TImode operand 2
18838 HIGH_DEST represents the high half (DImode) of TImode operand 0
18839 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18840 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18841
18842
18843 void
18844 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18845 rtx *low_in1, rtx *low_in2,
18846 rtx *high_dest, rtx *high_in1,
18847 rtx *high_in2)
18848 {
18849 *low_dest = gen_reg_rtx (DImode);
18850 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18851 subreg_lowpart_offset (DImode, TImode));
18852
18853 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18854 subreg_lowpart_offset (DImode, TImode));
18855 *high_dest = gen_reg_rtx (DImode);
18856
18857 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18858 subreg_highpart_offset (DImode, TImode));
18859 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18860 subreg_highpart_offset (DImode, TImode));
18861 }
18862
18863 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18864
18865 OP0 represents the TImode destination operand 0
18866 LOW_DEST represents the low half (DImode) of TImode operand 0
18867 LOW_IN1 represents the low half (DImode) of TImode operand 1
18868 LOW_IN2 represents the low half (DImode) of TImode operand 2
18869 HIGH_DEST represents the high half (DImode) of TImode operand 0
18870 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18871 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18872 UNSIGNED_P is true if the operation is being performed on unsigned
18873 values. */
18874 void
18875 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18876 rtx low_in2, rtx high_dest, rtx high_in1,
18877 rtx high_in2, bool unsigned_p)
18878 {
18879 if (low_in2 == const0_rtx)
18880 {
18881 low_dest = low_in1;
18882 high_in2 = force_reg (DImode, high_in2);
18883 if (unsigned_p)
18884 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18885 else
18886 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18887 }
18888 else
18889 {
18890 if (CONST_INT_P (low_in2))
18891 {
18892 high_in2 = force_reg (DImode, high_in2);
18893 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18894 GEN_INT (-INTVAL (low_in2))));
18895 }
18896 else
18897 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18898
18899 if (unsigned_p)
18900 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18901 else
18902 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18903 }
18904
18905 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18906 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18907
18908 }
18909
18910 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18911
18912 static unsigned HOST_WIDE_INT
18913 aarch64_asan_shadow_offset (void)
18914 {
18915 if (TARGET_ILP32)
18916 return (HOST_WIDE_INT_1 << 29);
18917 else
18918 return (HOST_WIDE_INT_1 << 36);
18919 }
18920
18921 static rtx
18922 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18923 int code, tree treeop0, tree treeop1)
18924 {
18925 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18926 rtx op0, op1;
18927 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18928 insn_code icode;
18929 struct expand_operand ops[4];
18930
18931 start_sequence ();
18932 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18933
18934 op_mode = GET_MODE (op0);
18935 if (op_mode == VOIDmode)
18936 op_mode = GET_MODE (op1);
18937
18938 switch (op_mode)
18939 {
18940 case E_QImode:
18941 case E_HImode:
18942 case E_SImode:
18943 cmp_mode = SImode;
18944 icode = CODE_FOR_cmpsi;
18945 break;
18946
18947 case E_DImode:
18948 cmp_mode = DImode;
18949 icode = CODE_FOR_cmpdi;
18950 break;
18951
18952 case E_SFmode:
18953 cmp_mode = SFmode;
18954 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18955 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18956 break;
18957
18958 case E_DFmode:
18959 cmp_mode = DFmode;
18960 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18961 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18962 break;
18963
18964 default:
18965 end_sequence ();
18966 return NULL_RTX;
18967 }
18968
18969 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18970 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18971 if (!op0 || !op1)
18972 {
18973 end_sequence ();
18974 return NULL_RTX;
18975 }
18976 *prep_seq = get_insns ();
18977 end_sequence ();
18978
18979 create_fixed_operand (&ops[0], op0);
18980 create_fixed_operand (&ops[1], op1);
18981
18982 start_sequence ();
18983 if (!maybe_expand_insn (icode, 2, ops))
18984 {
18985 end_sequence ();
18986 return NULL_RTX;
18987 }
18988 *gen_seq = get_insns ();
18989 end_sequence ();
18990
18991 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18992 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18993 }
18994
18995 static rtx
18996 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18997 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18998 {
18999 rtx op0, op1, target;
19000 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19001 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19002 insn_code icode;
19003 struct expand_operand ops[6];
19004 int aarch64_cond;
19005
19006 push_to_sequence (*prep_seq);
19007 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19008
19009 op_mode = GET_MODE (op0);
19010 if (op_mode == VOIDmode)
19011 op_mode = GET_MODE (op1);
19012
19013 switch (op_mode)
19014 {
19015 case E_QImode:
19016 case E_HImode:
19017 case E_SImode:
19018 cmp_mode = SImode;
19019 icode = CODE_FOR_ccmpsi;
19020 break;
19021
19022 case E_DImode:
19023 cmp_mode = DImode;
19024 icode = CODE_FOR_ccmpdi;
19025 break;
19026
19027 case E_SFmode:
19028 cmp_mode = SFmode;
19029 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19030 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
19031 break;
19032
19033 case E_DFmode:
19034 cmp_mode = DFmode;
19035 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19036 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
19037 break;
19038
19039 default:
19040 end_sequence ();
19041 return NULL_RTX;
19042 }
19043
19044 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
19045 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
19046 if (!op0 || !op1)
19047 {
19048 end_sequence ();
19049 return NULL_RTX;
19050 }
19051 *prep_seq = get_insns ();
19052 end_sequence ();
19053
19054 target = gen_rtx_REG (cc_mode, CC_REGNUM);
19055 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
19056
19057 if (bit_code != AND)
19058 {
19059 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
19060 GET_MODE (XEXP (prev, 0))),
19061 VOIDmode, XEXP (prev, 0), const0_rtx);
19062 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
19063 }
19064
19065 create_fixed_operand (&ops[0], XEXP (prev, 0));
19066 create_fixed_operand (&ops[1], target);
19067 create_fixed_operand (&ops[2], op0);
19068 create_fixed_operand (&ops[3], op1);
19069 create_fixed_operand (&ops[4], prev);
19070 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
19071
19072 push_to_sequence (*gen_seq);
19073 if (!maybe_expand_insn (icode, 6, ops))
19074 {
19075 end_sequence ();
19076 return NULL_RTX;
19077 }
19078
19079 *gen_seq = get_insns ();
19080 end_sequence ();
19081
19082 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
19083 }
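
/* Taken together, the two hooks above allow a combined condition such as
   (a == 0 && b > 2) to be expanded as roughly
     cmp   w0, 0
     ccmp  w1, 2, <nzcv>, eq
     b.gt  ...
   where <nzcv> encodes the flag value to substitute when the first
   comparison already settles the result (illustrative assembly only; the
   exact encoding is chosen by the ccmp patterns). */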
19084
19085 #undef TARGET_GEN_CCMP_FIRST
19086 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19087
19088 #undef TARGET_GEN_CCMP_NEXT
19089 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19090
19091 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19092 instruction fusion of some sort. */
19093
19094 static bool
19095 aarch64_macro_fusion_p (void)
19096 {
19097 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
19098 }
19099
19100
19101 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19102 should be kept together during scheduling. */
19103
19104 static bool
19105 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
19106 {
19107 rtx set_dest;
19108 rtx prev_set = single_set (prev);
19109 rtx curr_set = single_set (curr);
19110 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
19111 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
19112
19113 if (!aarch64_macro_fusion_p ())
19114 return false;
19115
19116 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
19117 {
19118 /* We are trying to match:
19119 prev (mov) == (set (reg r0) (const_int imm16))
19120 curr (movk) == (set (zero_extract (reg r0)
19121 (const_int 16)
19122 (const_int 16))
19123 (const_int imm16_1)) */
19124
19125 set_dest = SET_DEST (curr_set);
19126
19127 if (GET_CODE (set_dest) == ZERO_EXTRACT
19128 && CONST_INT_P (SET_SRC (curr_set))
19129 && CONST_INT_P (SET_SRC (prev_set))
19130 && CONST_INT_P (XEXP (set_dest, 2))
19131 && INTVAL (XEXP (set_dest, 2)) == 16
19132 && REG_P (XEXP (set_dest, 0))
19133 && REG_P (SET_DEST (prev_set))
19134 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
19135 {
19136 return true;
19137 }
19138 }
19139
19140 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
19141 {
19142
19143 /* We're trying to match:
19144 prev (adrp) == (set (reg r1)
19145 (high (symbol_ref ("SYM"))))
19146 curr (add) == (set (reg r0)
19147 (lo_sum (reg r1)
19148 (symbol_ref ("SYM"))))
19149 Note that r0 need not necessarily be the same as r1, especially
19150 during pre-regalloc scheduling. */
19151
19152 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19153 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19154 {
19155 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
19156 && REG_P (XEXP (SET_SRC (curr_set), 0))
19157 && REGNO (XEXP (SET_SRC (curr_set), 0))
19158 == REGNO (SET_DEST (prev_set))
19159 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
19160 XEXP (SET_SRC (curr_set), 1)))
19161 return true;
19162 }
19163 }
19164
19165 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
19166 {
19167
19168 /* We're trying to match:
19169 prev (movk) == (set (zero_extract (reg r0)
19170 (const_int 16)
19171 (const_int 32))
19172 (const_int imm16_1))
19173 curr (movk) == (set (zero_extract (reg r0)
19174 (const_int 16)
19175 (const_int 48))
19176 (const_int imm16_2)) */
19177
19178 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
19179 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
19180 && REG_P (XEXP (SET_DEST (prev_set), 0))
19181 && REG_P (XEXP (SET_DEST (curr_set), 0))
19182 && REGNO (XEXP (SET_DEST (prev_set), 0))
19183 == REGNO (XEXP (SET_DEST (curr_set), 0))
19184 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
19185 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
19186 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
19187 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
19188 && CONST_INT_P (SET_SRC (prev_set))
19189 && CONST_INT_P (SET_SRC (curr_set)))
19190 return true;
19191
19192 }
19193 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
19194 {
19195 /* We're trying to match:
19196 prev (adrp) == (set (reg r0)
19197 (high (symbol_ref ("SYM"))))
19198 curr (ldr) == (set (reg r1)
19199 (mem (lo_sum (reg r0)
19200 (symbol_ref ("SYM")))))
19201 or
19202 curr (ldr) == (set (reg r1)
19203 (zero_extend (mem
19204 (lo_sum (reg r0)
19205 (symbol_ref ("SYM")))))) */
19206 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19207 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19208 {
19209 rtx curr_src = SET_SRC (curr_set);
19210
19211 if (GET_CODE (curr_src) == ZERO_EXTEND)
19212 curr_src = XEXP (curr_src, 0);
19213
19214 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
19215 && REG_P (XEXP (XEXP (curr_src, 0), 0))
19216 && REGNO (XEXP (XEXP (curr_src, 0), 0))
19217 == REGNO (SET_DEST (prev_set))
19218 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
19219 XEXP (SET_SRC (prev_set), 0)))
19220 return true;
19221 }
19222 }
19223
19224 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
19225 && any_condjump_p (curr))
19226 {
19227 unsigned int condreg1, condreg2;
19228 rtx cc_reg_1;
19229 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
19230 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
19231
19232 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
19233 && prev
19234 && modified_in_p (cc_reg_1, prev))
19235 {
19236 enum attr_type prev_type = get_attr_type (prev);
19237
19238 /* FIXME: this misses some instructions that are considered simple
19239 arithmetic for ThunderX. Simple shifts are missed here. */
19240 if (prev_type == TYPE_ALUS_SREG
19241 || prev_type == TYPE_ALUS_IMM
19242 || prev_type == TYPE_LOGICS_REG
19243 || prev_type == TYPE_LOGICS_IMM)
19244 return true;
19245 }
19246 }
19247
19248 if (prev_set
19249 && curr_set
19250 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
19251 && any_condjump_p (curr))
19252 {
19253 /* We're trying to match:
19254 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
19255 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19256 (const_int 0))
19257 (label_ref ("SYM"))
19258 (pc)) */
19259 if (SET_DEST (curr_set) == (pc_rtx)
19260 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19261 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19262 && REG_P (SET_DEST (prev_set))
19263 && REGNO (SET_DEST (prev_set))
19264 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19265 {
19266 /* Fuse ALU operations followed by a conditional branch instruction. */
19267 switch (get_attr_type (prev))
19268 {
19269 case TYPE_ALU_IMM:
19270 case TYPE_ALU_SREG:
19271 case TYPE_ADC_REG:
19272 case TYPE_ADC_IMM:
19273 case TYPE_ADCS_REG:
19274 case TYPE_ADCS_IMM:
19275 case TYPE_LOGIC_REG:
19276 case TYPE_LOGIC_IMM:
19277 case TYPE_CSEL:
19278 case TYPE_ADR:
19279 case TYPE_MOV_IMM:
19280 case TYPE_SHIFT_REG:
19281 case TYPE_SHIFT_IMM:
19282 case TYPE_BFM:
19283 case TYPE_RBIT:
19284 case TYPE_REV:
19285 case TYPE_EXTEND:
19286 return true;
19287
19288 default:;
19289 }
19290 }
19291 }
19292
19293 return false;
19294 }
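
/* For reference, typical instruction pairs matched above include
     mov   w0, 49370
     movk  w0, 0x140, lsl 16       (AARCH64_FUSE_MOV_MOVK)
     adrp  x0, sym
     add   x0, x0, :lo12:sym       (AARCH64_FUSE_ADRP_ADD)
     adrp  x0, sym
     ldr   x1, [x0, :lo12:sym]     (AARCH64_FUSE_ADRP_LDR)
   (illustrative assembly; the matching itself is done on the RTL forms
   shown in the comments above). */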
19295
19296 /* Return true iff the instruction fusion described by OP is enabled. */
19297
19298 bool
19299 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19300 {
19301 return (aarch64_tune_params.fusible_ops & op) != 0;
19302 }
19303
19304 /* If MEM is in the form of [base+offset], extract the two parts of the
19305    address and store them in BASE and OFFSET; otherwise return false
19306    after clearing BASE and OFFSET.  */
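/* Illustrative examples (assuming the usual RTL for AArch64 addresses):
     (mem (reg x1))                        -> BASE = (reg x1), OFFSET = 0
     (mem (plus (reg x1) (const_int 16)))  -> BASE = (reg x1), OFFSET = 16
   Any other address form, e.g. a pre/post-modify, clears both and
   returns false.  */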
19307
19308 bool
19309 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19310 {
19311 rtx addr;
19312
19313 gcc_assert (MEM_P (mem));
19314
19315 addr = XEXP (mem, 0);
19316
19317 if (REG_P (addr))
19318 {
19319 *base = addr;
19320 *offset = const0_rtx;
19321 return true;
19322 }
19323
19324 if (GET_CODE (addr) == PLUS
19325 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19326 {
19327 *base = XEXP (addr, 0);
19328 *offset = XEXP (addr, 1);
19329 return true;
19330 }
19331
19332 *base = NULL_RTX;
19333 *offset = NULL_RTX;
19334
19335 return false;
19336 }
19337
19338 /* Types for scheduling fusion. */
19339 enum sched_fusion_type
19340 {
19341 SCHED_FUSION_NONE = 0,
19342 SCHED_FUSION_LD_SIGN_EXTEND,
19343 SCHED_FUSION_LD_ZERO_EXTEND,
19344 SCHED_FUSION_LD,
19345 SCHED_FUSION_ST,
19346 SCHED_FUSION_NUM
19347 };
19348
19349 /* If INSN is a load or store whose address has the form [base+offset],
19350    extract the two parts and store them in BASE and OFFSET.  Return the
19351    scheduling fusion type of this INSN.  */
19352
19353 static enum sched_fusion_type
19354 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19355 {
19356 rtx x, dest, src;
19357 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19358
19359 gcc_assert (INSN_P (insn));
19360 x = PATTERN (insn);
19361 if (GET_CODE (x) != SET)
19362 return SCHED_FUSION_NONE;
19363
19364 src = SET_SRC (x);
19365 dest = SET_DEST (x);
19366
19367 machine_mode dest_mode = GET_MODE (dest);
19368
19369 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19370 return SCHED_FUSION_NONE;
19371
19372 if (GET_CODE (src) == SIGN_EXTEND)
19373 {
19374 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19375 src = XEXP (src, 0);
19376 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19377 return SCHED_FUSION_NONE;
19378 }
19379 else if (GET_CODE (src) == ZERO_EXTEND)
19380 {
19381 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19382 src = XEXP (src, 0);
19383 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19384 return SCHED_FUSION_NONE;
19385 }
19386
19387 if (GET_CODE (src) == MEM && REG_P (dest))
19388 extract_base_offset_in_addr (src, base, offset);
19389 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19390 {
19391 fusion = SCHED_FUSION_ST;
19392 extract_base_offset_in_addr (dest, base, offset);
19393 }
19394 else
19395 return SCHED_FUSION_NONE;
19396
19397 if (*base == NULL_RTX || *offset == NULL_RTX)
19398 fusion = SCHED_FUSION_NONE;
19399
19400 return fusion;
19401 }
19402
19403 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19404
19405    Currently we only support fusing ldr and str instructions, so FUSION_PRI
19406    and PRI are only calculated for these instructions.  For other instructions,
19407    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
19408    types of instruction fusion can be added by returning different priorities.
19409
19410 It's important that irrelevant instructions get the largest FUSION_PRI. */
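/* For instance (illustrative), the two stores
     str w1, [x2, 4]
     str w3, [x2, 8]
   get the same FUSION_PRI (same fusion type and base register) but
   different PRI values; the one with the smaller offset gets the larger
   PRI and is therefore scheduled first, keeping the pair adjacent.  */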
19411
19412 static void
19413 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19414 int *fusion_pri, int *pri)
19415 {
19416 int tmp, off_val;
19417 rtx base, offset;
19418 enum sched_fusion_type fusion;
19419
19420 gcc_assert (INSN_P (insn));
19421
19422 tmp = max_pri - 1;
19423 fusion = fusion_load_store (insn, &base, &offset);
19424 if (fusion == SCHED_FUSION_NONE)
19425 {
19426 *pri = tmp;
19427 *fusion_pri = tmp;
19428 return;
19429 }
19430
19431 /* Set FUSION_PRI according to fusion type and base register. */
19432 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19433
19434 /* Calculate PRI. */
19435 tmp /= 2;
19436
19437 /* INSN with smaller offset goes first. */
19438 off_val = (int)(INTVAL (offset));
19439 if (off_val >= 0)
19440 tmp -= (off_val & 0xfffff);
19441 else
19442 tmp += ((- off_val) & 0xfffff);
19443
19444 *pri = tmp;
19445 return;
19446 }
19447
19448 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19449 Adjust priority of sha1h instructions so they are scheduled before
19450 other SHA1 instructions. */
19451
19452 static int
19453 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19454 {
19455 rtx x = PATTERN (insn);
19456
19457 if (GET_CODE (x) == SET)
19458 {
19459 x = SET_SRC (x);
19460
19461 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19462 return priority + 10;
19463 }
19464
19465 return priority;
19466 }
19467
19468 /* Given OPERANDS of consecutive load/store, check if we can merge
19469 them into ldp/stp. LOAD is true if they are load instructions.
19470 MODE is the mode of memory operands. */
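/* For example (illustrative), the consecutive loads
     ldr x0, [x3, 8]
     ldr x1, [x3, 16]
   share the same base, have offsets that differ by the access size and use
   distinct destination registers of the same class, so this function
   returns true and they can become ldp x0, x1, [x3, 8].  */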
19471
19472 bool
19473 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19474 machine_mode mode)
19475 {
19476 HOST_WIDE_INT offval_1, offval_2, msize;
19477 enum reg_class rclass_1, rclass_2;
19478 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19479
19480 if (load)
19481 {
19482 mem_1 = operands[1];
19483 mem_2 = operands[3];
19484 reg_1 = operands[0];
19485 reg_2 = operands[2];
19486 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19487 if (REGNO (reg_1) == REGNO (reg_2))
19488 return false;
19489 }
19490 else
19491 {
19492 mem_1 = operands[0];
19493 mem_2 = operands[2];
19494 reg_1 = operands[1];
19495 reg_2 = operands[3];
19496 }
19497
19498 /* The mems cannot be volatile. */
19499 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19500 return false;
19501
19502 /* If we have SImode and slow unaligned ldp,
19503      check that the alignment is at least 8 bytes.  */
19504 if (mode == SImode
19505 && (aarch64_tune_params.extra_tuning_flags
19506 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19507 && !optimize_size
19508 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19509 return false;
19510
19511 /* Check if the addresses are in the form of [base+offset]. */
19512 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19513 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19514 return false;
19515 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19516 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19517 return false;
19518
19519   /* Check if the bases are the same.  */
19520 if (!rtx_equal_p (base_1, base_2))
19521 return false;
19522
19523 /* The operands must be of the same size. */
19524 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19525 GET_MODE_SIZE (GET_MODE (mem_2))));
19526
19527 offval_1 = INTVAL (offset_1);
19528 offval_2 = INTVAL (offset_2);
19529 /* We should only be trying this for fixed-sized modes. There is no
19530 SVE LDP/STP instruction. */
19531 msize = GET_MODE_SIZE (mode).to_constant ();
19532 /* Check if the offsets are consecutive. */
19533 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19534 return false;
19535
19536 /* Check if the addresses are clobbered by load. */
19537 if (load)
19538 {
19539 if (reg_mentioned_p (reg_1, mem_1))
19540 return false;
19541
19542 /* In increasing order, the last load can clobber the address. */
19543 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19544 return false;
19545 }
19546
19547 /* One of the memory accesses must be a mempair operand.
19548 If it is not the first one, they need to be swapped by the
19549 peephole. */
19550 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19551 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19552 return false;
19553
19554 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19555 rclass_1 = FP_REGS;
19556 else
19557 rclass_1 = GENERAL_REGS;
19558
19559 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19560 rclass_2 = FP_REGS;
19561 else
19562 rclass_2 = GENERAL_REGS;
19563
19564   /* Check if the registers are of the same class.  */
19565 if (rclass_1 != rclass_2)
19566 return false;
19567
19568 return true;
19569 }
19570
19571 /* Given OPERANDS of consecutive load/store that can be merged,
19572 swap them if they are not in ascending order. */
19573 void
19574 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19575 {
19576 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19577 HOST_WIDE_INT offval_1, offval_2;
19578
19579 if (load)
19580 {
19581 mem_1 = operands[1];
19582 mem_2 = operands[3];
19583 }
19584 else
19585 {
19586 mem_1 = operands[0];
19587 mem_2 = operands[2];
19588 }
19589
19590 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19591 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19592
19593 offval_1 = INTVAL (offset_1);
19594 offval_2 = INTVAL (offset_2);
19595
19596 if (offval_1 > offval_2)
19597 {
19598 /* Irrespective of whether this is a load or a store,
19599 we do the same swap. */
19600 std::swap (operands[0], operands[2]);
19601 std::swap (operands[1], operands[3]);
19602 }
19603 }
19604
19605 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19606 comparison between the two. */
19607 int
19608 aarch64_host_wide_int_compare (const void *x, const void *y)
19609 {
19610 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19611 * ((const HOST_WIDE_INT *) y));
19612 }
19613
19614 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19615 other pointing to a REG rtx containing an offset, compare the offsets
19616 of the two pairs.
19617
19618 Return:
19619
19620 1 iff offset (X) > offset (Y)
19621 0 iff offset (X) == offset (Y)
19622 -1 iff offset (X) < offset (Y) */
19623 int
19624 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19625 {
19626 const rtx * operands_1 = (const rtx *) x;
19627 const rtx * operands_2 = (const rtx *) y;
19628 rtx mem_1, mem_2, base, offset_1, offset_2;
19629
19630 if (MEM_P (operands_1[0]))
19631 mem_1 = operands_1[0];
19632 else
19633 mem_1 = operands_1[1];
19634
19635 if (MEM_P (operands_2[0]))
19636 mem_2 = operands_2[0];
19637 else
19638 mem_2 = operands_2[1];
19639
19640 /* Extract the offsets. */
19641 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19642 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19643
19644 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19645
19646 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19647 }
19648
19649 /* Given OPERANDS of consecutive load/store, check if we can merge
19650 them into ldp/stp by adjusting the offset. LOAD is true if they
19651 are load instructions. MODE is the mode of memory operands.
19652
19653 Given below consecutive stores:
19654
19655 str w1, [xb, 0x100]
19656 str w1, [xb, 0x104]
19657 str w1, [xb, 0x108]
19658 str w1, [xb, 0x10c]
19659
19660 Though the offsets are out of the range supported by stp, we can
19661 still pair them after adjusting the offset, like:
19662
19663 add scratch, xb, 0x100
19664 stp w1, w1, [scratch]
19665 stp w1, w1, [scratch, 0x8]
19666
19667 The peephole patterns detecting this opportunity should guarantee
19668    the scratch register is available.  */
19669
19670 bool
19671 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19672 scalar_mode mode)
19673 {
19674 const int num_insns = 4;
19675 enum reg_class rclass;
19676 HOST_WIDE_INT offvals[num_insns], msize;
19677 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19678
19679 if (load)
19680 {
19681 for (int i = 0; i < num_insns; i++)
19682 {
19683 reg[i] = operands[2 * i];
19684 mem[i] = operands[2 * i + 1];
19685
19686 gcc_assert (REG_P (reg[i]));
19687 }
19688
19689 /* Do not attempt to merge the loads if the loads clobber each other. */
19690 for (int i = 0; i < 8; i += 2)
19691 for (int j = i + 2; j < 8; j += 2)
19692 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19693 return false;
19694 }
19695 else
19696 for (int i = 0; i < num_insns; i++)
19697 {
19698 mem[i] = operands[2 * i];
19699 reg[i] = operands[2 * i + 1];
19700 }
19701
19702 /* Skip if memory operand is by itself valid for ldp/stp. */
19703 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19704 return false;
19705
19706 for (int i = 0; i < num_insns; i++)
19707 {
19708 /* The mems cannot be volatile. */
19709 if (MEM_VOLATILE_P (mem[i]))
19710 return false;
19711
19712 /* Check if the addresses are in the form of [base+offset]. */
19713 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19714 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19715 return false;
19716 }
19717
19718   /* Check if the registers are of the same class.  */
19719 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19720 ? FP_REGS : GENERAL_REGS;
19721
19722 for (int i = 1; i < num_insns; i++)
19723 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19724 {
19725 if (rclass != FP_REGS)
19726 return false;
19727 }
19728 else
19729 {
19730 if (rclass != GENERAL_REGS)
19731 return false;
19732 }
19733
19734 /* Only the last register in the order in which they occur
19735 may be clobbered by the load. */
19736 if (rclass == GENERAL_REGS && load)
19737 for (int i = 0; i < num_insns - 1; i++)
19738 if (reg_mentioned_p (reg[i], mem[i]))
19739 return false;
19740
19741   /* Check if the bases are the same.  */
19742 for (int i = 0; i < num_insns - 1; i++)
19743 if (!rtx_equal_p (base[i], base[i + 1]))
19744 return false;
19745
19746 for (int i = 0; i < num_insns; i++)
19747 offvals[i] = INTVAL (offset[i]);
19748
19749 msize = GET_MODE_SIZE (mode);
19750
19751 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19752 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19753 aarch64_host_wide_int_compare);
19754
19755 if (!(offvals[1] == offvals[0] + msize
19756 && offvals[3] == offvals[2] + msize))
19757 return false;
19758
19759 /* Check that offsets are within range of each other. The ldp/stp
19760      instructions have 7-bit immediate offsets, so use 0x80.  */
19761 if (offvals[2] - offvals[0] >= msize * 0x80)
19762 return false;
19763
19764 /* The offsets must be aligned with respect to each other. */
19765 if (offvals[0] % msize != offvals[2] % msize)
19766 return false;
19767
19768 /* If we have SImode and slow unaligned ldp,
19769      check that the alignment is at least 8 bytes.  */
19770 if (mode == SImode
19771 && (aarch64_tune_params.extra_tuning_flags
19772 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19773 && !optimize_size
19774 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19775 return false;
19776
19777 return true;
19778 }
19779
19780 /* Given OPERANDS of consecutive load/store, this function pairs them
19781 into LDP/STP after adjusting the offset. It depends on the fact
19782 that the operands can be sorted so the offsets are correct for STP.
19783 MODE is the mode of memory operands. CODE is the rtl operator
19784 which should be applied to all memory operands, it's SIGN_EXTEND,
19785 ZERO_EXTEND or UNKNOWN. */
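/* Worked example (illustrative), continuing the str example shown above
   aarch64_operands_adjust_ok_for_ldpstp, with SImode stores at offsets
   0x100, 0x104, 0x108 and 0x10c from base xb: the midpoint of the two
   pair offsets is 0x104, which is then bumped by the element size to
   0x108, so the code emits
     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]
   with both new pair offsets (-8 and 0) comfortably inside the LDP/STP
   range.  */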
19786
19787 bool
19788 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19789 scalar_mode mode, RTX_CODE code)
19790 {
19791 rtx base, offset_1, offset_3, t1, t2;
19792 rtx mem_1, mem_2, mem_3, mem_4;
19793 rtx temp_operands[8];
19794 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19795 stp_off_upper_limit, stp_off_lower_limit, msize;
19796
19797 /* We make changes on a copy as we may still bail out. */
19798 for (int i = 0; i < 8; i ++)
19799 temp_operands[i] = operands[i];
19800
19801 /* Sort the operands. */
19802 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19803
19804 /* Copy the memory operands so that if we have to bail for some
19805 reason the original addresses are unchanged. */
19806 if (load)
19807 {
19808 mem_1 = copy_rtx (temp_operands[1]);
19809 mem_2 = copy_rtx (temp_operands[3]);
19810 mem_3 = copy_rtx (temp_operands[5]);
19811 mem_4 = copy_rtx (temp_operands[7]);
19812 }
19813 else
19814 {
19815 mem_1 = copy_rtx (temp_operands[0]);
19816 mem_2 = copy_rtx (temp_operands[2]);
19817 mem_3 = copy_rtx (temp_operands[4]);
19818 mem_4 = copy_rtx (temp_operands[6]);
19819 gcc_assert (code == UNKNOWN);
19820 }
19821
19822 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19823 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19824 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19825 && offset_3 != NULL_RTX);
19826
19827 /* Adjust offset so it can fit in LDP/STP instruction. */
19828 msize = GET_MODE_SIZE (mode);
19829 stp_off_upper_limit = msize * (0x40 - 1);
19830 stp_off_lower_limit = - msize * 0x40;
19831
19832 off_val_1 = INTVAL (offset_1);
19833 off_val_3 = INTVAL (offset_3);
19834
19835 /* The base offset is optimally half way between the two STP/LDP offsets. */
19836 if (msize <= 4)
19837 base_off = (off_val_1 + off_val_3) / 2;
19838 else
19839 /* However, due to issues with negative LDP/STP offset generation for
19840        larger modes such as DF, DI and vector modes, we must not use negative
19841 addresses smaller than 9 signed unadjusted bits can store. This
19842 provides the most range in this case. */
19843 base_off = off_val_1;
19844
19845 /* Adjust the base so that it is aligned with the addresses but still
19846 optimal. */
19847 if (base_off % msize != off_val_1 % msize)
19848 /* Fix the offset, bearing in mind we want to make it bigger not
19849 smaller. */
19850 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19851 else if (msize <= 4)
19852 /* The negative range of LDP/STP is one larger than the positive range. */
19853 base_off += msize;
19854
19855 /* Check if base offset is too big or too small. We can attempt to resolve
19856 this issue by setting it to the maximum value and seeing if the offsets
19857 still fit. */
19858 if (base_off >= 0x1000)
19859 {
19860 base_off = 0x1000 - 1;
19861 /* We must still make sure that the base offset is aligned with respect
19862 	 to the address.  But it may not be made any bigger.  */
19863 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19864 }
19865
19866 /* Likewise for the case where the base is too small. */
19867 if (base_off <= -0x1000)
19868 {
19869 base_off = -0x1000 + 1;
19870 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19871 }
19872
19873 /* Offset of the first STP/LDP. */
19874 new_off_1 = off_val_1 - base_off;
19875
19876 /* Offset of the second STP/LDP. */
19877 new_off_3 = off_val_3 - base_off;
19878
19879 /* The offsets must be within the range of the LDP/STP instructions. */
19880 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19881 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19882 return false;
19883
19884 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19885 new_off_1), true);
19886 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19887 new_off_1 + msize), true);
19888 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19889 new_off_3), true);
19890 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19891 new_off_3 + msize), true);
19892
19893 if (!aarch64_mem_pair_operand (mem_1, mode)
19894 || !aarch64_mem_pair_operand (mem_3, mode))
19895 return false;
19896
19897 if (code == ZERO_EXTEND)
19898 {
19899 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19900 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19901 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19902 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19903 }
19904 else if (code == SIGN_EXTEND)
19905 {
19906 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19907 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19908 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19909 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19910 }
19911
19912 if (load)
19913 {
19914 operands[0] = temp_operands[0];
19915 operands[1] = mem_1;
19916 operands[2] = temp_operands[2];
19917 operands[3] = mem_2;
19918 operands[4] = temp_operands[4];
19919 operands[5] = mem_3;
19920 operands[6] = temp_operands[6];
19921 operands[7] = mem_4;
19922 }
19923 else
19924 {
19925 operands[0] = mem_1;
19926 operands[1] = temp_operands[1];
19927 operands[2] = mem_2;
19928 operands[3] = temp_operands[3];
19929 operands[4] = mem_3;
19930 operands[5] = temp_operands[5];
19931 operands[6] = mem_4;
19932 operands[7] = temp_operands[7];
19933 }
19934
19935 /* Emit adjusting instruction. */
19936 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19937 /* Emit ldp/stp instructions. */
19938 t1 = gen_rtx_SET (operands[0], operands[1]);
19939 t2 = gen_rtx_SET (operands[2], operands[3]);
19940 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19941 t1 = gen_rtx_SET (operands[4], operands[5]);
19942 t2 = gen_rtx_SET (operands[6], operands[7]);
19943 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19944 return true;
19945 }
19946
19947 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19948 it isn't worth branching around empty masked ops (including masked
19949 stores). */
19950
19951 static bool
19952 aarch64_empty_mask_is_expensive (unsigned)
19953 {
19954 return false;
19955 }
19956
19957 /* Return 1 if pseudo register should be created and used to hold
19958 GOT address for PIC code. */
19959
19960 bool
19961 aarch64_use_pseudo_pic_reg (void)
19962 {
19963 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19964 }
19965
19966 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19967
19968 static int
19969 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19970 {
19971 switch (XINT (x, 1))
19972 {
19973 case UNSPEC_GOTSMALLPIC:
19974 case UNSPEC_GOTSMALLPIC28K:
19975 case UNSPEC_GOTTINYPIC:
19976 return 0;
19977 default:
19978 break;
19979 }
19980
19981 return default_unspec_may_trap_p (x, flags);
19982 }
19983
19984
19985 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19986 return the log2 of that value. Otherwise return -1. */
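/* Illustrative values: 4.0 -> 2, 1.0 -> 0, while 0.5 (not an integer)
   and -8.0 (negative) both return -1.  */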
19987
19988 int
19989 aarch64_fpconst_pow_of_2 (rtx x)
19990 {
19991 const REAL_VALUE_TYPE *r;
19992
19993 if (!CONST_DOUBLE_P (x))
19994 return -1;
19995
19996 r = CONST_DOUBLE_REAL_VALUE (x);
19997
19998 if (REAL_VALUE_NEGATIVE (*r)
19999 || REAL_VALUE_ISNAN (*r)
20000 || REAL_VALUE_ISINF (*r)
20001 || !real_isinteger (r, DFmode))
20002 return -1;
20003
20004 return exact_log2 (real_to_integer (r));
20005 }
20006
20007 /* If X is a positive CONST_DOUBLE whose value is the reciprocal of a
20008    power of 2 (i.e. 1/2^n), return the exponent n; e.g. for x == 1/2^n
20009    return n.  Otherwise return -1.  */
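/* Illustrative values: 0.25 -> 2 and 0.125 -> 3, whereas 1.0 returns -1
   because the computed exponent (0) is outside the accepted 1..32 range.  */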
20010
20011 int
20012 aarch64_fpconst_pow2_recip (rtx x)
20013 {
20014 REAL_VALUE_TYPE r0;
20015
20016 if (!CONST_DOUBLE_P (x))
20017 return -1;
20018
20019 r0 = *CONST_DOUBLE_REAL_VALUE (x);
20020 if (exact_real_inverse (DFmode, &r0)
20021 && !REAL_VALUE_NEGATIVE (r0))
20022 {
20023 int ret = exact_log2 (real_to_integer (&r0));
20024 if (ret >= 1 && ret <= 32)
20025 return ret;
20026 }
20027 return -1;
20028 }
20029
20030 /* If X is a vector of equal CONST_DOUBLE values and that value is
20031 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20032
20033 int
20034 aarch64_vec_fpconst_pow_of_2 (rtx x)
20035 {
20036 int nelts;
20037 if (GET_CODE (x) != CONST_VECTOR
20038 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
20039 return -1;
20040
20041 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
20042 return -1;
20043
20044 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
20045 if (firstval <= 0)
20046 return -1;
20047
20048 for (int i = 1; i < nelts; i++)
20049 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
20050 return -1;
20051
20052 return firstval;
20053 }
20054
20055 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20056 to float.
20057
20058 __fp16 always promotes through this hook.
20059 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20060 through the generic excess precision logic rather than here. */
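/* For example, given __fp16 a, b, the sum a + b is evaluated in float
   because both operands promote through this hook; the result is only
   converted back to __fp16 if it is stored in one.  */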
20061
20062 static tree
20063 aarch64_promoted_type (const_tree t)
20064 {
20065 if (SCALAR_FLOAT_TYPE_P (t)
20066 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
20067 return float_type_node;
20068
20069 return NULL_TREE;
20070 }
20071
20072 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20073
20074 static bool
20075 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
20076 optimization_type opt_type)
20077 {
20078 switch (op)
20079 {
20080 case rsqrt_optab:
20081 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
20082
20083 default:
20084 return true;
20085 }
20086 }
20087
20088 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20089
20090 static unsigned int
20091 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
20092 int *offset)
20093 {
20094 /* Polynomial invariant 1 == (VG / 2) - 1. */
20095 gcc_assert (i == 1);
20096 *factor = 2;
20097 *offset = 1;
20098 return AARCH64_DWARF_VG;
20099 }
20100
20101 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
20102 if MODE is HFmode, and punt to the generic implementation otherwise. */
20103
20104 static bool
20105 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
20106 {
20107 return (mode == HFmode
20108 ? true
20109 : default_libgcc_floating_mode_supported_p (mode));
20110 }
20111
20112 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20113 if MODE is HFmode, and punt to the generic implementation otherwise. */
20114
20115 static bool
20116 aarch64_scalar_mode_supported_p (scalar_mode mode)
20117 {
20118 return (mode == HFmode
20119 ? true
20120 : default_scalar_mode_supported_p (mode));
20121 }
20122
20123 /* Set the value of FLT_EVAL_METHOD.
20124 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20125
20126 0: evaluate all operations and constants, whose semantic type has at
20127 most the range and precision of type float, to the range and
20128 precision of float; evaluate all other operations and constants to
20129 the range and precision of the semantic type;
20130
20131 N, where _FloatN is a supported interchange floating type
20132 evaluate all operations and constants, whose semantic type has at
20133 most the range and precision of _FloatN type, to the range and
20134 precision of the _FloatN type; evaluate all other operations and
20135 constants to the range and precision of the semantic type;
20136
20137 If we have the ARMv8.2-A extensions then we support _Float16 in native
20138 precision, so we should set this to 16. Otherwise, we support the type,
20139 but want to evaluate expressions in float precision, so set this to
20140 0. */
20141
20142 static enum flt_eval_method
20143 aarch64_excess_precision (enum excess_precision_type type)
20144 {
20145 switch (type)
20146 {
20147 case EXCESS_PRECISION_TYPE_FAST:
20148 case EXCESS_PRECISION_TYPE_STANDARD:
20149 /* We can calculate either in 16-bit range and precision or
20150 32-bit range and precision. Make that decision based on whether
20151 we have native support for the ARMv8.2-A 16-bit floating-point
20152 instructions or not. */
20153 return (TARGET_FP_F16INST
20154 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20155 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
20156 case EXCESS_PRECISION_TYPE_IMPLICIT:
20157 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
20158 default:
20159 gcc_unreachable ();
20160 }
20161 return FLT_EVAL_METHOD_UNPREDICTABLE;
20162 }
20163
20164 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20165 scheduled for speculative execution. Reject the long-running division
20166 and square-root instructions. */
20167
20168 static bool
20169 aarch64_sched_can_speculate_insn (rtx_insn *insn)
20170 {
20171 switch (get_attr_type (insn))
20172 {
20173 case TYPE_SDIV:
20174 case TYPE_UDIV:
20175 case TYPE_FDIVS:
20176 case TYPE_FDIVD:
20177 case TYPE_FSQRTS:
20178 case TYPE_FSQRTD:
20179 case TYPE_NEON_FP_SQRT_S:
20180 case TYPE_NEON_FP_SQRT_D:
20181 case TYPE_NEON_FP_SQRT_S_Q:
20182 case TYPE_NEON_FP_SQRT_D_Q:
20183 case TYPE_NEON_FP_DIV_S:
20184 case TYPE_NEON_FP_DIV_D:
20185 case TYPE_NEON_FP_DIV_S_Q:
20186 case TYPE_NEON_FP_DIV_D_Q:
20187 return false;
20188 default:
20189 return true;
20190 }
20191 }
20192
20193 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20194
20195 static int
20196 aarch64_compute_pressure_classes (reg_class *classes)
20197 {
20198 int i = 0;
20199 classes[i++] = GENERAL_REGS;
20200 classes[i++] = FP_REGS;
20201 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20202 registers need to go in PR_LO_REGS at some point during their
20203 lifetime. Splitting it into two halves has the effect of making
20204 all predicates count against PR_LO_REGS, so that we try whenever
20205 possible to restrict the number of live predicates to 8. This
20206 greatly reduces the amount of spilling in certain loops. */
20207 classes[i++] = PR_LO_REGS;
20208 classes[i++] = PR_HI_REGS;
20209 return i;
20210 }
20211
20212 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20213
20214 static bool
20215 aarch64_can_change_mode_class (machine_mode from,
20216 machine_mode to, reg_class_t)
20217 {
20218 if (BYTES_BIG_ENDIAN)
20219 {
20220 bool from_sve_p = aarch64_sve_data_mode_p (from);
20221 bool to_sve_p = aarch64_sve_data_mode_p (to);
20222
20223 /* Don't allow changes between SVE data modes and non-SVE modes.
20224 See the comment at the head of aarch64-sve.md for details. */
20225 if (from_sve_p != to_sve_p)
20226 return false;
20227
20228 /* Don't allow changes in element size: lane 0 of the new vector
20229 would not then be lane 0 of the old vector. See the comment
20230 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20231 description.
20232
20233 In the worst case, this forces a register to be spilled in
20234 one mode and reloaded in the other, which handles the
20235 endianness correctly. */
20236 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
20237 return false;
20238 }
20239 return true;
20240 }
20241
20242 /* Implement TARGET_EARLY_REMAT_MODES. */
20243
20244 static void
20245 aarch64_select_early_remat_modes (sbitmap modes)
20246 {
20247 /* SVE values are not normally live across a call, so it should be
20248 worth doing early rematerialization even in VL-specific mode. */
20249 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
20250 if (aarch64_sve_mode_p ((machine_mode) i))
20251 bitmap_set_bit (modes, i);
20252 }
20253
20254 /* Override the default target speculation_safe_value. */
20255 static rtx
20256 aarch64_speculation_safe_value (machine_mode mode,
20257 rtx result, rtx val, rtx failval)
20258 {
20259 /* Maybe we should warn if falling back to hard barriers. They are
20260      likely to be noticeably more expensive than the alternative below.  */
20261 if (!aarch64_track_speculation)
20262 return default_speculation_safe_value (mode, result, val, failval);
20263
20264 if (!REG_P (val))
20265 val = copy_to_mode_reg (mode, val);
20266
20267 if (!aarch64_reg_or_zero (failval, mode))
20268 failval = copy_to_mode_reg (mode, failval);
20269
20270 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20271 return result;
20272 }
20273
20274 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20275 Look into the tuning structure for an estimate.
20276 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20277 Advanced SIMD 128 bits. */
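/* For instance (illustrative), with a tuned sve_width of SVE_256 a
   poly_int64 of 2 + 2x (coeffs {2, 2}) is estimated as
   2 + 2 * (256 - 128) / 128 = 4.  */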
20278
20279 static HOST_WIDE_INT
20280 aarch64_estimated_poly_value (poly_int64 val)
20281 {
20282 enum aarch64_sve_vector_bits_enum width_source
20283 = aarch64_tune_params.sve_width;
20284
20285 /* If we still don't have an estimate, use the default. */
20286 if (width_source == SVE_SCALABLE)
20287 return default_estimated_poly_value (val);
20288
20289 HOST_WIDE_INT over_128 = width_source - 128;
20290 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20291 }
20292
20293
20294 /* Return true for types that could be supported as SIMD return or
20295 argument types. */
20296
20297 static bool
20298 supported_simd_type (tree t)
20299 {
20300 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20301 {
20302 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20303 return s == 1 || s == 2 || s == 4 || s == 8;
20304 }
20305 return false;
20306 }
20307
20308 /* Return true for types that currently are supported as SIMD return
20309 or argument types. */
20310
20311 static bool
20312 currently_supported_simd_type (tree t, tree b)
20313 {
20314 if (COMPLEX_FLOAT_TYPE_P (t))
20315 return false;
20316
20317 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20318 return false;
20319
20320 return supported_simd_type (t);
20321 }
20322
20323 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
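/* For example (illustrative), a "declare simd" clone with a float base
   type and no explicit simdlen yields two variants: a 64-bit one with
   simdlen 2 and a 128-bit one with simdlen 4.  An explicit simdlen of 4
   instead selects just the single 128-bit variant.  */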
20324
20325 static int
20326 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20327 struct cgraph_simd_clone *clonei,
20328 tree base_type, int num)
20329 {
20330 tree t, ret_type, arg_type;
20331 unsigned int elt_bits, vec_bits, count;
20332
20333 if (!TARGET_SIMD)
20334 return 0;
20335
20336 if (clonei->simdlen
20337 && (clonei->simdlen < 2
20338 || clonei->simdlen > 1024
20339 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20340 {
20341 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20342 "unsupported simdlen %d", clonei->simdlen);
20343 return 0;
20344 }
20345
20346 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20347 if (TREE_CODE (ret_type) != VOID_TYPE
20348 && !currently_supported_simd_type (ret_type, base_type))
20349 {
20350 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20351 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20352 "GCC does not currently support mixed size types "
20353 "for %<simd%> functions");
20354 else if (supported_simd_type (ret_type))
20355 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20356 "GCC does not currently support return type %qT "
20357 "for %<simd%> functions", ret_type);
20358 else
20359 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20360 "unsupported return type %qT for %<simd%> functions",
20361 ret_type);
20362 return 0;
20363 }
20364
20365 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20366 {
20367 arg_type = TREE_TYPE (t);
20368
20369 if (!currently_supported_simd_type (arg_type, base_type))
20370 {
20371 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20372 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20373 "GCC does not currently support mixed size types "
20374 "for %<simd%> functions");
20375 else
20376 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20377 "GCC does not currently support argument type %qT "
20378 "for %<simd%> functions", arg_type);
20379 return 0;
20380 }
20381 }
20382
20383 clonei->vecsize_mangle = 'n';
20384 clonei->mask_mode = VOIDmode;
20385 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20386 if (clonei->simdlen == 0)
20387 {
20388 count = 2;
20389 vec_bits = (num == 0 ? 64 : 128);
20390 clonei->simdlen = vec_bits / elt_bits;
20391 }
20392 else
20393 {
20394 count = 1;
20395 vec_bits = clonei->simdlen * elt_bits;
20396 if (vec_bits != 64 && vec_bits != 128)
20397 {
20398 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20399 "GCC does not currently support simdlen %d for type %qT",
20400 clonei->simdlen, base_type);
20401 return 0;
20402 }
20403 }
20404 clonei->vecsize_int = vec_bits;
20405 clonei->vecsize_float = vec_bits;
20406 return count;
20407 }
20408
20409 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20410
20411 static void
20412 aarch64_simd_clone_adjust (struct cgraph_node *node)
20413 {
20414 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20415 use the correct ABI. */
20416
20417 tree t = TREE_TYPE (node->decl);
20418 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20419 TYPE_ATTRIBUTES (t));
20420 }
20421
20422 /* Implement TARGET_SIMD_CLONE_USABLE. */
20423
20424 static int
20425 aarch64_simd_clone_usable (struct cgraph_node *node)
20426 {
20427 switch (node->simdclone->vecsize_mangle)
20428 {
20429 case 'n':
20430 if (!TARGET_SIMD)
20431 return -1;
20432 return 0;
20433 default:
20434 gcc_unreachable ();
20435 }
20436 }
20437
20438 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20439
20440 static int
20441 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20442 {
20443 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20444 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20445 return 0;
20446 return 1;
20447 }
20448
20449 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20450
20451 static const char *
20452 aarch64_get_multilib_abi_name (void)
20453 {
20454 if (TARGET_BIG_END)
20455 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20456 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20457 }
20458
20459 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
20460    global-variable-based guard, use the default; otherwise
20461    return a null tree.  */
20462 static tree
20463 aarch64_stack_protect_guard (void)
20464 {
20465 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20466 return default_stack_protect_guard ();
20467
20468 return NULL_TREE;
20469 }
20470
20471 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20472 section at the end if needed. */
20473 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20474 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20475 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20476 void
20477 aarch64_file_end_indicate_exec_stack ()
20478 {
20479 file_end_indicate_exec_stack ();
20480
20481 unsigned feature_1_and = 0;
20482 if (aarch64_bti_enabled ())
20483 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20484
20485 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20486 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20487
20488 if (feature_1_and)
20489 {
20490 /* Generate .note.gnu.property section. */
20491 switch_to_section (get_section (".note.gnu.property",
20492 SECTION_NOTYPE, NULL));
20493
20494 /* PT_NOTE header: namesz, descsz, type.
20495 namesz = 4 ("GNU\0")
20496 descsz = 16 (Size of the program property array)
20497 [(12 + padding) * Number of array elements]
20498 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20499 assemble_align (POINTER_SIZE);
20500 assemble_integer (GEN_INT (4), 4, 32, 1);
20501 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20502 assemble_integer (GEN_INT (5), 4, 32, 1);
20503
20504 /* PT_NOTE name. */
20505 assemble_string ("GNU", 4);
20506
20507 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20508 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20509 datasz = 4
20510 data = feature_1_and. */
20511 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20512 assemble_integer (GEN_INT (4), 4, 32, 1);
20513 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20514
20515 /* Pad the size of the note to the required alignment. */
20516 assemble_align (POINTER_SIZE);
20517 }
20518 }
20519 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20520 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20521 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20522
20523 /* Target-specific selftests. */
20524
20525 #if CHECKING_P
20526
20527 namespace selftest {
20528
20529 /* Selftest for the RTL loader.
20530 Verify that the RTL loader copes with a dump from
20531 print_rtx_function. This is essentially just a test that class
20532 function_reader can handle a real dump, but it also verifies
20533 that lookup_reg_by_dump_name correctly handles hard regs.
20534 The presence of hard reg names in the dump means that the test is
20535 target-specific, hence it is in this file. */
20536
20537 static void
20538 aarch64_test_loading_full_dump ()
20539 {
20540 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20541
20542 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20543
20544 rtx_insn *insn_1 = get_insn_by_uid (1);
20545 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20546
20547 rtx_insn *insn_15 = get_insn_by_uid (15);
20548 ASSERT_EQ (INSN, GET_CODE (insn_15));
20549 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20550
20551 /* Verify crtl->return_rtx. */
20552 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20553 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20554 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20555 }
20556
20557 /* Run all target-specific selftests. */
20558
20559 static void
20560 aarch64_run_selftests (void)
20561 {
20562 aarch64_test_loading_full_dump ();
20563 }
20564
20565 } // namespace selftest
20566
20567 #endif /* #if CHECKING_P */
20568
20569 #undef TARGET_STACK_PROTECT_GUARD
20570 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20571
20572 #undef TARGET_ADDRESS_COST
20573 #define TARGET_ADDRESS_COST aarch64_address_cost
20574
20575 /* This hook determines whether unnamed bitfields affect the alignment
20576 of the containing structure. The hook returns true if the structure
20577 should inherit the alignment requirements of an unnamed bitfield's
20578 type. */
20579 #undef TARGET_ALIGN_ANON_BITFIELD
20580 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20581
20582 #undef TARGET_ASM_ALIGNED_DI_OP
20583 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20584
20585 #undef TARGET_ASM_ALIGNED_HI_OP
20586 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20587
20588 #undef TARGET_ASM_ALIGNED_SI_OP
20589 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20590
20591 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20592 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20593 hook_bool_const_tree_hwi_hwi_const_tree_true
20594
20595 #undef TARGET_ASM_FILE_START
20596 #define TARGET_ASM_FILE_START aarch64_start_file
20597
20598 #undef TARGET_ASM_OUTPUT_MI_THUNK
20599 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20600
20601 #undef TARGET_ASM_SELECT_RTX_SECTION
20602 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20603
20604 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20605 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20606
20607 #undef TARGET_BUILD_BUILTIN_VA_LIST
20608 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20609
20610 #undef TARGET_CALLEE_COPIES
20611 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20612
20613 #undef TARGET_CAN_ELIMINATE
20614 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20615
20616 #undef TARGET_CAN_INLINE_P
20617 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20618
20619 #undef TARGET_CANNOT_FORCE_CONST_MEM
20620 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20621
20622 #undef TARGET_CASE_VALUES_THRESHOLD
20623 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20624
20625 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20626 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20627
20628 /* Only the least significant bit is used for initialization guard
20629 variables. */
20630 #undef TARGET_CXX_GUARD_MASK_BIT
20631 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20632
20633 #undef TARGET_C_MODE_FOR_SUFFIX
20634 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20635
20636 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20637 #undef TARGET_DEFAULT_TARGET_FLAGS
20638 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20639 #endif
20640
20641 #undef TARGET_CLASS_MAX_NREGS
20642 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20643
20644 #undef TARGET_BUILTIN_DECL
20645 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20646
20647 #undef TARGET_BUILTIN_RECIPROCAL
20648 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20649
20650 #undef TARGET_C_EXCESS_PRECISION
20651 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20652
20653 #undef TARGET_EXPAND_BUILTIN
20654 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20655
20656 #undef TARGET_EXPAND_BUILTIN_VA_START
20657 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20658
20659 #undef TARGET_FOLD_BUILTIN
20660 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20661
20662 #undef TARGET_FUNCTION_ARG
20663 #define TARGET_FUNCTION_ARG aarch64_function_arg
20664
20665 #undef TARGET_FUNCTION_ARG_ADVANCE
20666 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20667
20668 #undef TARGET_FUNCTION_ARG_BOUNDARY
20669 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20670
20671 #undef TARGET_FUNCTION_ARG_PADDING
20672 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20673
20674 #undef TARGET_GET_RAW_RESULT_MODE
20675 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20676 #undef TARGET_GET_RAW_ARG_MODE
20677 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20678
20679 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20680 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20681
20682 #undef TARGET_FUNCTION_VALUE
20683 #define TARGET_FUNCTION_VALUE aarch64_function_value
20684
20685 #undef TARGET_FUNCTION_VALUE_REGNO_P
20686 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20687
20688 #undef TARGET_GIMPLE_FOLD_BUILTIN
20689 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20690
20691 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20692 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20693
20694 #undef TARGET_INIT_BUILTINS
20695 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20696
20697 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20698 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20699 aarch64_ira_change_pseudo_allocno_class
20700
20701 #undef TARGET_LEGITIMATE_ADDRESS_P
20702 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20703
20704 #undef TARGET_LEGITIMATE_CONSTANT_P
20705 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20706
20707 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20708 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20709 aarch64_legitimize_address_displacement
20710
20711 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20712 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20713
20714 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20715 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20716 aarch64_libgcc_floating_mode_supported_p
20717
20718 #undef TARGET_MANGLE_TYPE
20719 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20720
20721 #undef TARGET_MEMORY_MOVE_COST
20722 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20723
20724 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20725 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20726
20727 #undef TARGET_MUST_PASS_IN_STACK
20728 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20729
20730 /* This target hook should return true if accesses to volatile bitfields
20731 should use the narrowest mode possible. It should return false if these
20732 accesses should use the bitfield container type. */
20733 #undef TARGET_NARROW_VOLATILE_BITFIELD
20734 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20735
20736 #undef TARGET_OPTION_OVERRIDE
20737 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20738
20739 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20740 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20741 aarch64_override_options_after_change
20742
20743 #undef TARGET_OPTION_SAVE
20744 #define TARGET_OPTION_SAVE aarch64_option_save
20745
20746 #undef TARGET_OPTION_RESTORE
20747 #define TARGET_OPTION_RESTORE aarch64_option_restore
20748
20749 #undef TARGET_OPTION_PRINT
20750 #define TARGET_OPTION_PRINT aarch64_option_print
20751
20752 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20753 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20754
20755 #undef TARGET_SET_CURRENT_FUNCTION
20756 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20757
20758 #undef TARGET_PASS_BY_REFERENCE
20759 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20760
20761 #undef TARGET_PREFERRED_RELOAD_CLASS
20762 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20763
20764 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20765 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20766
20767 #undef TARGET_PROMOTED_TYPE
20768 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20769
20770 #undef TARGET_SECONDARY_RELOAD
20771 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20772
20773 #undef TARGET_SHIFT_TRUNCATION_MASK
20774 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20775
20776 #undef TARGET_SETUP_INCOMING_VARARGS
20777 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20778
20779 #undef TARGET_STRUCT_VALUE_RTX
20780 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20781
20782 #undef TARGET_REGISTER_MOVE_COST
20783 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20784
20785 #undef TARGET_RETURN_IN_MEMORY
20786 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20787
20788 #undef TARGET_RETURN_IN_MSB
20789 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20790
20791 #undef TARGET_RTX_COSTS
20792 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20793
20794 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20795 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20796
20797 #undef TARGET_SCHED_ISSUE_RATE
20798 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20799
20800 #undef TARGET_SCHED_VARIABLE_ISSUE
20801 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
20802
20803 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20804 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20805 aarch64_sched_first_cycle_multipass_dfa_lookahead
20806
20807 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20808 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20809 aarch64_first_cycle_multipass_dfa_lookahead_guard
20810
20811 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20812 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20813 aarch64_get_separate_components
20814
20815 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20816 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20817 aarch64_components_for_bb
20818
20819 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20820 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20821 aarch64_disqualify_components
20822
20823 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20824 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20825 aarch64_emit_prologue_components
20826
20827 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20828 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20829 aarch64_emit_epilogue_components
20830
20831 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20832 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20833 aarch64_set_handled_components
20834
20835 #undef TARGET_TRAMPOLINE_INIT
20836 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20837
20838 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20839 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20840
20841 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20842 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20843
20844 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20845 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20846 aarch64_builtin_support_vector_misalignment
20847
20848 #undef TARGET_ARRAY_MODE
20849 #define TARGET_ARRAY_MODE aarch64_array_mode
20850
20851 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20852 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20853
20854 #undef TARGET_VECTORIZE_ADD_STMT_COST
20855 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20856
20857 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20858 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20859 aarch64_builtin_vectorization_cost
20860
20861 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20862 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20863
20864 #undef TARGET_VECTORIZE_BUILTINS
20865 #define TARGET_VECTORIZE_BUILTINS
20866
20867 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20868 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20869 aarch64_builtin_vectorized_function
20870
20871 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20872 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20873 aarch64_autovectorize_vector_sizes
20874
20875 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20876 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20877 aarch64_atomic_assign_expand_fenv
20878
20879 /* Section anchor support. */
20880
20881 #undef TARGET_MIN_ANCHOR_OFFSET
20882 #define TARGET_MIN_ANCHOR_OFFSET -256
20883
20884 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20885 byte offset; we can do much more for larger data types, but have no way
20886 to determine the size of the access. We assume accesses are aligned. */
20887 #undef TARGET_MAX_ANCHOR_OFFSET
20888 #define TARGET_MAX_ANCHOR_OFFSET 4095
20889
20890 #undef TARGET_VECTOR_ALIGNMENT
20891 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20892
20893 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20894 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20895 aarch64_vectorize_preferred_vector_alignment
20896 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20897 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20898 aarch64_simd_vector_alignment_reachable
20899
20900 /* vec_perm support. */
20901
20902 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20903 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20904 aarch64_vectorize_vec_perm_const
20905
20906 #undef TARGET_VECTORIZE_GET_MASK_MODE
20907 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20908 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20909 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20910 aarch64_empty_mask_is_expensive
20911 #undef TARGET_PREFERRED_ELSE_VALUE
20912 #define TARGET_PREFERRED_ELSE_VALUE \
20913 aarch64_preferred_else_value
20914
20915 #undef TARGET_INIT_LIBFUNCS
20916 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20917
20918 #undef TARGET_FIXED_CONDITION_CODE_REGS
20919 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20920
20921 #undef TARGET_FLAGS_REGNUM
20922 #define TARGET_FLAGS_REGNUM CC_REGNUM
20923
20924 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20925 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20926
20927 #undef TARGET_ASAN_SHADOW_OFFSET
20928 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20929
20930 #undef TARGET_LEGITIMIZE_ADDRESS
20931 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20932
20933 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20934 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20935
20936 #undef TARGET_CAN_USE_DOLOOP_P
20937 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20938
20939 #undef TARGET_SCHED_ADJUST_PRIORITY
20940 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20941
20942 #undef TARGET_SCHED_MACRO_FUSION_P
20943 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20944
20945 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20946 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20947
20948 #undef TARGET_SCHED_FUSION_PRIORITY
20949 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20950
20951 #undef TARGET_UNSPEC_MAY_TRAP_P
20952 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20953
20954 #undef TARGET_USE_PSEUDO_PIC_REG
20955 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20956
20957 #undef TARGET_PRINT_OPERAND
20958 #define TARGET_PRINT_OPERAND aarch64_print_operand
20959
20960 #undef TARGET_PRINT_OPERAND_ADDRESS
20961 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20962
20963 #undef TARGET_OPTAB_SUPPORTED_P
20964 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20965
20966 #undef TARGET_OMIT_STRUCT_RETURN_REG
20967 #define TARGET_OMIT_STRUCT_RETURN_REG true
20968
20969 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20970 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20971 aarch64_dwarf_poly_indeterminate_value
20972
20973 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20974 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20975 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20976
20977 #undef TARGET_HARD_REGNO_NREGS
20978 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20979 #undef TARGET_HARD_REGNO_MODE_OK
20980 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20981
20982 #undef TARGET_MODES_TIEABLE_P
20983 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20984
20985 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20986 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20987 aarch64_hard_regno_call_part_clobbered
20988
20989 #undef TARGET_INSN_CALLEE_ABI
20990 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
20991
20992 #undef TARGET_CONSTANT_ALIGNMENT
20993 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20994
20995 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20996 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20997 aarch64_stack_clash_protection_alloca_probe_range
20998
20999 #undef TARGET_COMPUTE_PRESSURE_CLASSES
21000 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
21001
21002 #undef TARGET_CAN_CHANGE_MODE_CLASS
21003 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
21004
21005 #undef TARGET_SELECT_EARLY_REMAT_MODES
21006 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
21007
21008 #undef TARGET_SPECULATION_SAFE_VALUE
21009 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
21010
21011 #undef TARGET_ESTIMATED_POLY_VALUE
21012 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
21013
21014 #undef TARGET_ATTRIBUTE_TABLE
21015 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
21016
21017 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
21018 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
21019 aarch64_simd_clone_compute_vecsize_and_simdlen
21020
21021 #undef TARGET_SIMD_CLONE_ADJUST
21022 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
21023
21024 #undef TARGET_SIMD_CLONE_USABLE
21025 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
21026
21027 #undef TARGET_COMP_TYPE_ATTRIBUTES
21028 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
21029
21030 #undef TARGET_GET_MULTILIB_ABI_NAME
21031 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
21032
21033 #undef TARGET_FNTYPE_ABI
21034 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
21035
21036 #if CHECKING_P
21037 #undef TARGET_RUN_TARGET_SELFTESTS
21038 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
21039 #endif /* #if CHECKING_P */
21040
21041 #undef TARGET_ASM_POST_CFI_STARTPROC
21042 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21043
21044 struct gcc_target targetm = TARGET_INITIALIZER;
21045
21046 #include "gt-aarch64.h"