[AArch64] Use SVE binary immediate instructions for conditional arithmetic
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
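/* For example, under the default LP64 ABI POINTER_SIZE is 64 and
   BITS_PER_UNIT is 8, so POINTER_BYTES evaluates to 8; under -mabi=ilp32
   it evaluates to 4.  */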
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
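/* A rough illustration of how these constructors are meant to be used
   (hypothetical values, not taken from the real immediate-validation code):
   a vector whose 16-bit elements all equal 0x1200 could be described as

     simd_immediate_info (HImode, 0x12, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8);

   i.e. "MOVI with element value 0x12 shifted left by 8 bits", while an SVE
   INDEX constant { 0, 2, 4, ... } of 32-bit elements could be described as

     simd_immediate_info (SImode, const0_rtx, GEN_INT (2));  */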
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
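/* A sketch of how the X-macro above expands (the real entries live in
   aarch64-fusion-pairs.def); a definition along the lines of

     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)

   becomes the table element

     { "mov+movk", AARCH64_FUSE_MOV_MOVK },

   so each fusion name accepted on the command line maps onto exactly one
   AARCH64_FUSE_* bit.  */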
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
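/* To illustrate how the table is built (hypothetical core, since the real
   entries live in aarch64-cores.def): a definition such as

     AARCH64_CORE ("example-core", examplecore, cortexa57, 8A,
                   AARCH64_FL_FOR_ARCH8, cortexa57, 0x00, 0x000, -1)

   would expand to

     {"example-core", examplecore, cortexa57, AARCH64_ARCH_8A,
      all_architectures[AARCH64_ARCH_8A].architecture_version,
      AARCH64_FL_FOR_ARCH8, &cortexa57_tunings},

   mirroring the explicit "generic" entry above.  */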
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
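/* The A64 condition encoding places each condition next to its inverse,
   so flipping bit 0 is sufficient.  For example:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   (0 ^ 1 == 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  (10 ^ 1 == 11)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) == AARCH64_LE  (12 ^ 1 == 13)  */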
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
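/* A walk-through of how these tables drive parsing (no new behaviour):
   for -mbranch-protection=pac-ret+leaf, the "pac-ret" entry's handler sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF and the signing key to
   AARCH64_KEY_A; the parser then looks up "leaf" among
   aarch64_pac_ret_subtypes, whose handler widens the scope to
   AARCH64_FUNCTION_ALL.  */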
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
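/* A sketch of the resulting code (illustrative labels and condition):
   if a conditional branch such as

	b.eq	.Lfar_away	// target more than 1 MiB away

   cannot encode its offset, the caller passes the inverted condition as
   BRANCH_FORMAT and this function emits roughly

	b.ne	.Lbcond7	// short branch around the long one
	b	.Lfar_away	// unconditional B has a +/-128 MiB range
   .Lbcond7:

   so only the limited-range conditional branch targets a nearby label.  */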
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1486 integer, otherwise return X unmodified. */
1487 static rtx
1488 aarch64_bit_representation (rtx x)
1489 {
1490 if (CONST_DOUBLE_P (x))
1491 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1492 return x;
1493 }
1494
1495 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1496 static bool
1497 aarch64_advsimd_struct_mode_p (machine_mode mode)
1498 {
1499 return (TARGET_SIMD
1500 && (mode == OImode || mode == CImode || mode == XImode));
1501 }
1502
1503 /* Return true if MODE is an SVE predicate mode. */
1504 static bool
1505 aarch64_sve_pred_mode_p (machine_mode mode)
1506 {
1507 return (TARGET_SVE
1508 && (mode == VNx16BImode
1509 || mode == VNx8BImode
1510 || mode == VNx4BImode
1511 || mode == VNx2BImode));
1512 }
1513
1514 /* Three mutually-exclusive flags describing a vector or predicate type. */
1515 const unsigned int VEC_ADVSIMD = 1;
1516 const unsigned int VEC_SVE_DATA = 2;
1517 const unsigned int VEC_SVE_PRED = 4;
1518 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1519 a structure of 2, 3 or 4 vectors. */
1520 const unsigned int VEC_STRUCT = 8;
1521 /* Useful combinations of the above. */
1522 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1523 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1524
1525 /* Return a set of flags describing the vector properties of mode MODE.
1526 Ignore modes that are not supported by the current target. */
1527 static unsigned int
1528 aarch64_classify_vector_mode (machine_mode mode)
1529 {
1530 if (aarch64_advsimd_struct_mode_p (mode))
1531 return VEC_ADVSIMD | VEC_STRUCT;
1532
1533 if (aarch64_sve_pred_mode_p (mode))
1534 return VEC_SVE_PRED;
1535
1536 /* Make the decision based on the mode's enum value rather than its
1537 properties, so that we keep the correct classification regardless
1538 of -msve-vector-bits. */
1539 switch (mode)
1540 {
1541 /* Single SVE vectors. */
1542 case E_VNx16QImode:
1543 case E_VNx8HImode:
1544 case E_VNx4SImode:
1545 case E_VNx2DImode:
1546 case E_VNx8HFmode:
1547 case E_VNx4SFmode:
1548 case E_VNx2DFmode:
1549 return TARGET_SVE ? VEC_SVE_DATA : 0;
1550
1551 /* x2 SVE vectors. */
1552 case E_VNx32QImode:
1553 case E_VNx16HImode:
1554 case E_VNx8SImode:
1555 case E_VNx4DImode:
1556 case E_VNx16HFmode:
1557 case E_VNx8SFmode:
1558 case E_VNx4DFmode:
1559 /* x3 SVE vectors. */
1560 case E_VNx48QImode:
1561 case E_VNx24HImode:
1562 case E_VNx12SImode:
1563 case E_VNx6DImode:
1564 case E_VNx24HFmode:
1565 case E_VNx12SFmode:
1566 case E_VNx6DFmode:
1567 /* x4 SVE vectors. */
1568 case E_VNx64QImode:
1569 case E_VNx32HImode:
1570 case E_VNx16SImode:
1571 case E_VNx8DImode:
1572 case E_VNx32HFmode:
1573 case E_VNx16SFmode:
1574 case E_VNx8DFmode:
1575 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1576
1577 /* 64-bit Advanced SIMD vectors. */
1578 case E_V8QImode:
1579 case E_V4HImode:
1580 case E_V2SImode:
1581 /* ...E_V1DImode doesn't exist. */
1582 case E_V4HFmode:
1583 case E_V2SFmode:
1584 case E_V1DFmode:
1585 /* 128-bit Advanced SIMD vectors. */
1586 case E_V16QImode:
1587 case E_V8HImode:
1588 case E_V4SImode:
1589 case E_V2DImode:
1590 case E_V8HFmode:
1591 case E_V4SFmode:
1592 case E_V2DFmode:
1593 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1594
1595 default:
1596 return 0;
1597 }
1598 }
1599
1600 /* Return true if MODE is any of the data vector modes, including
1601 structure modes. */
1602 static bool
1603 aarch64_vector_data_mode_p (machine_mode mode)
1604 {
1605 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1606 }
1607
1608 /* Return true if MODE is an SVE data vector mode; either a single vector
1609 or a structure of vectors. */
1610 static bool
1611 aarch64_sve_data_mode_p (machine_mode mode)
1612 {
1613 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1614 }
1615
1616 /* Implement target hook TARGET_ARRAY_MODE. */
1617 static opt_machine_mode
1618 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1619 {
1620 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1621 && IN_RANGE (nelems, 2, 4))
1622 return mode_for_vector (GET_MODE_INNER (mode),
1623 GET_MODE_NUNITS (mode) * nelems);
1624
1625 return opt_machine_mode ();
1626 }
1627
1628 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1629 static bool
1630 aarch64_array_mode_supported_p (machine_mode mode,
1631 unsigned HOST_WIDE_INT nelems)
1632 {
1633 if (TARGET_SIMD
1634 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1635 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1636 && (nelems >= 2 && nelems <= 4))
1637 return true;
1638
1639 return false;
1640 }
1641
1642 /* Return the SVE predicate mode to use for elements that have
1643 ELEM_NBYTES bytes, if such a mode exists. */
1644
1645 opt_machine_mode
1646 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1647 {
1648 if (TARGET_SVE)
1649 {
1650 if (elem_nbytes == 1)
1651 return VNx16BImode;
1652 if (elem_nbytes == 2)
1653 return VNx8BImode;
1654 if (elem_nbytes == 4)
1655 return VNx4BImode;
1656 if (elem_nbytes == 8)
1657 return VNx2BImode;
1658 }
1659 return opt_machine_mode ();
1660 }
1661
1662 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1663
1664 static opt_machine_mode
1665 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1666 {
1667 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1668 {
1669 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1670 machine_mode pred_mode;
1671 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1672 return pred_mode;
1673 }
1674
1675 return default_get_mask_mode (nunits, nbytes);
1676 }
1677
1678 /* Return the integer element mode associated with SVE mode MODE. */
1679
1680 static scalar_int_mode
1681 aarch64_sve_element_int_mode (machine_mode mode)
1682 {
1683 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1684 GET_MODE_NUNITS (mode));
1685 return int_mode_for_size (elt_bits, 0).require ();
1686 }
1687
1688 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1689 prefer to use the first arithmetic operand as the else value if
1690 the else value doesn't matter, since that exactly matches the SVE
1691 destructive merging form. For ternary operations we could either
1692 pick the first operand and use FMAD-like instructions or the last
1693 operand and use FMLA-like instructions; the latter seems more
1694 natural. */
1695
1696 static tree
1697 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1698 {
1699 return nops == 3 ? ops[2] : ops[0];
1700 }
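/* For example, with a conditional fused multiply-add computing
   OPS[0] * OPS[1] + OPS[2], returning OPS[2] means inactive lanes keep the
   addend, which matches the destructive FMLA form (the accumulator doubles
   as the destination); returning OPS[0] would instead match FMAD, where a
   multiplicand is the destructive operand.  */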
1701
1702 /* Implement TARGET_HARD_REGNO_NREGS. */
1703
1704 static unsigned int
1705 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1706 {
1707 /* ??? Logically we should only need to provide a value when
1708 HARD_REGNO_MODE_OK says that the combination is valid,
1709 but at the moment we need to handle all modes. Just ignore
1710 any runtime parts for registers that can't store them. */
1711 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1712 switch (aarch64_regno_regclass (regno))
1713 {
1714 case FP_REGS:
1715 case FP_LO_REGS:
1716 case FP_LO8_REGS:
1717 if (aarch64_sve_data_mode_p (mode))
1718 return exact_div (GET_MODE_SIZE (mode),
1719 BYTES_PER_SVE_VECTOR).to_constant ();
1720 return CEIL (lowest_size, UNITS_PER_VREG);
1721 case PR_REGS:
1722 case PR_LO_REGS:
1723 case PR_HI_REGS:
1724 return 1;
1725 default:
1726 return CEIL (lowest_size, UNITS_PER_WORD);
1727 }
1728 gcc_unreachable ();
1729 }
1730
1731 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1732
1733 static bool
1734 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1735 {
1736 if (GET_MODE_CLASS (mode) == MODE_CC)
1737 return regno == CC_REGNUM;
1738
1739 if (regno == VG_REGNUM)
1740 /* This must have the same size as _Unwind_Word. */
1741 return mode == DImode;
1742
1743 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1744 if (vec_flags & VEC_SVE_PRED)
1745 return PR_REGNUM_P (regno);
1746
1747 if (PR_REGNUM_P (regno))
1748 return 0;
1749
1750 if (regno == SP_REGNUM)
1751 /* The purpose of comparing with ptr_mode is to support the
1752 global register variable associated with the stack pointer
1753 register via the syntax of asm ("wsp") in ILP32. */
1754 return mode == Pmode || mode == ptr_mode;
1755
1756 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1757 return mode == Pmode;
1758
1759 if (GP_REGNUM_P (regno))
1760 {
1761 if (known_le (GET_MODE_SIZE (mode), 8))
1762 return true;
1763 else if (known_le (GET_MODE_SIZE (mode), 16))
1764 return (regno & 1) == 0;
1765 }
1766 else if (FP_REGNUM_P (regno))
1767 {
1768 if (vec_flags & VEC_STRUCT)
1769 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1770 else
1771 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1772 }
1773
1774 return false;
1775 }
1776
1777 /* Return true if this is a definition of a vectorized simd function. */
1778
1779 static bool
1780 aarch64_simd_decl_p (tree fndecl)
1781 {
1782 tree fntype;
1783
1784 if (fndecl == NULL)
1785 return false;
1786 fntype = TREE_TYPE (fndecl);
1787 if (fntype == NULL)
1788 return false;
1789
1790 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1791 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1792 return true;
1793
1794 return false;
1795 }
1796
1797 /* Return the mode a register save/restore should use. DImode for integer
1798 registers, DFmode for FP registers in non-SIMD functions (they only save
1799 the bottom half of a 128 bit register), or TFmode for FP registers in
1800 SIMD functions. */
1801
1802 static machine_mode
1803 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1804 {
1805 return GP_REGNUM_P (regno)
1806 ? E_DImode
1807 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1808 }
1809
1810 /* Return true if the instruction is a call to a SIMD function, false
1811 if it is not a SIMD function or if we do not know anything about
1812 the function. */
1813
1814 static bool
1815 aarch64_simd_call_p (rtx_insn *insn)
1816 {
1817 rtx symbol;
1818 rtx call;
1819 tree fndecl;
1820
1821 gcc_assert (CALL_P (insn));
1822 call = get_call_rtx_from (insn);
1823 symbol = XEXP (XEXP (call, 0), 0);
1824 if (GET_CODE (symbol) != SYMBOL_REF)
1825 return false;
1826 fndecl = SYMBOL_REF_DECL (symbol);
1827 if (!fndecl)
1828 return false;
1829
1830 return aarch64_simd_decl_p (fndecl);
1831 }
1832
1833 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1834 a function that uses the SIMD ABI, take advantage of the extra
1835 call-preserved registers that the ABI provides. */
1836
1837 void
1838 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1839 HARD_REG_SET *return_set)
1840 {
1841 if (aarch64_simd_call_p (insn))
1842 {
1843 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1844 if (FP_SIMD_SAVED_REGNUM_P (regno))
1845 CLEAR_HARD_REG_BIT (*return_set, regno);
1846 }
1847 }
1848
1849 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1850 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1851 clobbers the top 64 bits when restoring the bottom 64 bits. */
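/* For example, a 16-byte TFmode value in V8 is part-clobbered by a call
   to a function that uses the base PCS, since such calls preserve only
   the low 64 bits of V8-V15, but it is not part-clobbered by a call to
   a function that uses the aarch64_vector_pcs ABI, which preserves the
   full 128-bit registers.  */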
1852
1853 static bool
1854 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1855 machine_mode mode)
1856 {
1857 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1858 return FP_REGNUM_P (regno)
1859 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1860 }
1861
1862 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1863
1864 rtx_insn *
1865 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1866 {
1867 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1868
1869 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1870 return call_1;
1871 else
1872 return call_2;
1873 }
1874
1875 /* Implement REGMODE_NATURAL_SIZE. */
1876 poly_uint64
1877 aarch64_regmode_natural_size (machine_mode mode)
1878 {
1879 /* The natural size for SVE data modes is one SVE data vector,
1880 and similarly for predicates. We can't independently modify
1881 anything smaller than that. */
1882 /* ??? For now, only do this for variable-width SVE registers.
1883 Doing it for constant-sized registers breaks lower-subreg.c. */
1884 /* ??? And once that's fixed, we should probably have similar
1885 code for Advanced SIMD. */
1886 if (!aarch64_sve_vg.is_constant ())
1887 {
1888 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1889 if (vec_flags & VEC_SVE_PRED)
1890 return BYTES_PER_SVE_PRED;
1891 if (vec_flags & VEC_SVE_DATA)
1892 return BYTES_PER_SVE_VECTOR;
1893 }
1894 return UNITS_PER_WORD;
1895 }
1896
1897 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1898 machine_mode
1899 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1900 machine_mode mode)
1901 {
1902 /* The predicate mode determines which bits are significant and
1903 which are "don't care". Decreasing the number of lanes would
1904 lose data while increasing the number of lanes would make bits
1905 unnecessarily significant. */
1906 if (PR_REGNUM_P (regno))
1907 return mode;
1908 if (known_ge (GET_MODE_SIZE (mode), 4))
1909 return mode;
1910 else
1911 return SImode;
1912 }
1913
1914 /* Return true if I's bits are consecutive ones from the MSB. */
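/* For example, -1 and 0xffff000000000000 (i.e. -(1 << 48)) are accepted,
   since negating them gives a power of 2, whereas 0 and
   0xffff000000000001 are rejected.  */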
1915 bool
1916 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1917 {
1918 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1919 }
1920
1921 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1922 that strcpy from constants will be faster. */
1923
1924 static HOST_WIDE_INT
1925 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1926 {
1927 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1928 return MAX (align, BITS_PER_WORD);
1929 return align;
1930 }
1931
1932 /* Return true if calls to DECL should be treated as
1933 long-calls (ie called via a register). */
1934 static bool
1935 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1936 {
1937 return false;
1938 }
1939
1940 /* Return true if calls to symbol-ref SYM should be treated as
1941 long-calls (ie called via a register). */
1942 bool
1943 aarch64_is_long_call_p (rtx sym)
1944 {
1945 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1946 }
1947
1948 /* Return true if calls to symbol-ref SYM should not go through
1949 plt stubs. */
1950
1951 bool
1952 aarch64_is_noplt_call_p (rtx sym)
1953 {
1954 const_tree decl = SYMBOL_REF_DECL (sym);
1955
1956 if (flag_pic
1957 && decl
1958 && (!flag_plt
1959 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1960 && !targetm.binds_local_p (decl))
1961 return true;
1962
1963 return false;
1964 }
1965
1966 /* Return true if the offsets to a zero/sign-extract operation
1967 represent an expression that matches an extend operation. The
1968 operands represent the parameters from
1969
1970 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
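/* For example, a MULT_IMM of 4 and an EXTRACT_IMM of 34 are accepted
   for DImode: extracting the low 34 bits of (reg * 4) is equivalent to
   extending the low 32 bits of reg and shifting the result left by 2,
   i.e. the kind of operand accepted by the UXTW/SXTW #2
   extended-register forms.  */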
1971 bool
1972 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1973 rtx extract_imm)
1974 {
1975 HOST_WIDE_INT mult_val, extract_val;
1976
1977 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1978 return false;
1979
1980 mult_val = INTVAL (mult_imm);
1981 extract_val = INTVAL (extract_imm);
1982
1983 if (extract_val > 8
1984 && extract_val < GET_MODE_BITSIZE (mode)
1985 && exact_log2 (extract_val & ~7) > 0
1986 && (extract_val & 7) <= 4
1987 && mult_val == (1 << (extract_val & 7)))
1988 return true;
1989
1990 return false;
1991 }
1992
1993 /* Emit an insn that's a simple single-set. Both the operands must be
1994 known to be valid. */
1995 inline static rtx_insn *
1996 emit_set_insn (rtx x, rtx y)
1997 {
1998 return emit_insn (gen_rtx_SET (x, y));
1999 }
2000
2001 /* X and Y are two things to compare using CODE. Emit the compare insn and
2002 return the rtx for register 0 in the proper mode. */
2003 rtx
2004 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2005 {
2006 machine_mode mode = SELECT_CC_MODE (code, x, y);
2007 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2008
2009 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2010 return cc_reg;
2011 }
2012
2013 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2014
2015 static rtx
2016 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2017 machine_mode y_mode)
2018 {
2019 if (y_mode == E_QImode || y_mode == E_HImode)
2020 {
2021 if (CONST_INT_P (y))
2022 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2023 else
2024 {
2025 rtx t, cc_reg;
2026 machine_mode cc_mode;
2027
2028 t = gen_rtx_ZERO_EXTEND (SImode, y);
2029 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2030 cc_mode = CC_SWPmode;
2031 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2032 emit_set_insn (cc_reg, t);
2033 return cc_reg;
2034 }
2035 }
2036
2037 return aarch64_gen_compare_reg (code, x, y);
2038 }
2039
2040 /* Build the SYMBOL_REF for __tls_get_addr. */
2041
2042 static GTY(()) rtx tls_get_addr_libfunc;
2043
2044 rtx
2045 aarch64_tls_get_addr (void)
2046 {
2047 if (!tls_get_addr_libfunc)
2048 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2049 return tls_get_addr_libfunc;
2050 }
2051
2052 /* Return the TLS model to use for ADDR. */
2053
2054 static enum tls_model
2055 tls_symbolic_operand_type (rtx addr)
2056 {
2057 enum tls_model tls_kind = TLS_MODEL_NONE;
2058 if (GET_CODE (addr) == CONST)
2059 {
2060 poly_int64 addend;
2061 rtx sym = strip_offset (addr, &addend);
2062 if (GET_CODE (sym) == SYMBOL_REF)
2063 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2064 }
2065 else if (GET_CODE (addr) == SYMBOL_REF)
2066 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2067
2068 return tls_kind;
2069 }
2070
2071 /* We allow lo_sum expressions in our legitimate addresses so that
2072 combine can take care of combining addresses where necessary, but
2073 for generation purposes we generate the address as:
2074
2075 RTL Absolute
2076 tmp = hi (symbol_ref); adrp x1, foo
2077 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2078 nop
2079
2080 PIC TLS
2081 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2082 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2083 bl __tls_get_addr
2084 nop
2085
2086 Load TLS symbol, depending on TLS mechanism and TLS access model.
2087
2088 Global Dynamic - Traditional TLS:
2089 adrp tmp, :tlsgd:imm
2090 add dest, tmp, #:tlsgd_lo12:imm
2091 bl __tls_get_addr
2092
2093 Global Dynamic - TLS Descriptors:
2094 adrp dest, :tlsdesc:imm
2095 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2096 add dest, dest, #:tlsdesc_lo12:imm
2097 blr tmp
2098 mrs tp, tpidr_el0
2099 add dest, dest, tp
2100
2101 Initial Exec:
2102 mrs tp, tpidr_el0
2103 adrp tmp, :gottprel:imm
2104 ldr dest, [tmp, #:gottprel_lo12:imm]
2105 add dest, dest, tp
2106
2107 Local Exec:
2108 mrs tp, tpidr_el0
2109 add t0, tp, #:tprel_hi12:imm, lsl #12
2110 add t0, t0, #:tprel_lo12_nc:imm
2111 */
2112
2113 static void
2114 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2115 enum aarch64_symbol_type type)
2116 {
2117 switch (type)
2118 {
2119 case SYMBOL_SMALL_ABSOLUTE:
2120 {
2121 /* In ILP32, the mode of dest can be either SImode or DImode. */
2122 rtx tmp_reg = dest;
2123 machine_mode mode = GET_MODE (dest);
2124
2125 gcc_assert (mode == Pmode || mode == ptr_mode);
2126
2127 if (can_create_pseudo_p ())
2128 tmp_reg = gen_reg_rtx (mode);
2129
2130 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2131 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2132 return;
2133 }
2134
2135 case SYMBOL_TINY_ABSOLUTE:
2136 emit_insn (gen_rtx_SET (dest, imm));
2137 return;
2138
2139 case SYMBOL_SMALL_GOT_28K:
2140 {
2141 machine_mode mode = GET_MODE (dest);
2142 rtx gp_rtx = pic_offset_table_rtx;
2143 rtx insn;
2144 rtx mem;
2145
2146 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2147 here before rtl expansion.  Tree IVOPTs will generate rtl patterns to
2148 decide rtx costs, in which case pic_offset_table_rtx is not
2149 initialized.  In that case there is no need to generate the first adrp
2150 instruction, as the final cost for global variable access is
2151 one instruction.  */
2152 if (gp_rtx != NULL)
2153 {
2154 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2155 use the page base as the GOT base, the first page may be wasted; in
2156 the worst case there is only 28K of space for the GOT).
2157
2158 The generated instruction sequence for accessing a global variable
2159 is:
2160
2161 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2162
2163 Only one instruction is needed, but we must initialize
2164 pic_offset_table_rtx properly.  We generate an initialization insn
2165 for every global access and rely on CSE to remove the redundant ones.
2166
2167 The final instruction sequence will look like the following when
2168 multiple global variables are accessed:
2169
2170 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2171
2172 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2173 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2174 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2175 ... */
2176
2177 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2178 crtl->uses_pic_offset_table = 1;
2179 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2180
2181 if (mode != GET_MODE (gp_rtx))
2182 gp_rtx = gen_lowpart (mode, gp_rtx);
2183
2184 }
2185
2186 if (mode == ptr_mode)
2187 {
2188 if (mode == DImode)
2189 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2190 else
2191 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2192
2193 mem = XVECEXP (SET_SRC (insn), 0, 0);
2194 }
2195 else
2196 {
2197 gcc_assert (mode == Pmode);
2198
2199 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2200 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2201 }
2202
2203 /* The operand is expected to be a MEM.  Whenever the related insn
2204 pattern changes, the above code that calculates mem should be
2205 updated.  */
2206 gcc_assert (GET_CODE (mem) == MEM);
2207 MEM_READONLY_P (mem) = 1;
2208 MEM_NOTRAP_P (mem) = 1;
2209 emit_insn (insn);
2210 return;
2211 }
2212
2213 case SYMBOL_SMALL_GOT_4G:
2214 {
2215 /* In ILP32, the mode of dest can be either SImode or DImode,
2216 while the got entry is always of SImode size. The mode of
2217 dest depends on how dest is used: if dest is assigned to a
2218 pointer (e.g. stored in memory), it has SImode; it may have
2219 DImode if dest is dereferenced to access memory.
2220 This is why we have to handle three different ldr_got_small
2221 patterns here (two patterns for ILP32). */
2222
2223 rtx insn;
2224 rtx mem;
2225 rtx tmp_reg = dest;
2226 machine_mode mode = GET_MODE (dest);
2227
2228 if (can_create_pseudo_p ())
2229 tmp_reg = gen_reg_rtx (mode);
2230
2231 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2232 if (mode == ptr_mode)
2233 {
2234 if (mode == DImode)
2235 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2236 else
2237 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2238
2239 mem = XVECEXP (SET_SRC (insn), 0, 0);
2240 }
2241 else
2242 {
2243 gcc_assert (mode == Pmode);
2244
2245 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2246 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2247 }
2248
2249 gcc_assert (GET_CODE (mem) == MEM);
2250 MEM_READONLY_P (mem) = 1;
2251 MEM_NOTRAP_P (mem) = 1;
2252 emit_insn (insn);
2253 return;
2254 }
2255
2256 case SYMBOL_SMALL_TLSGD:
2257 {
2258 rtx_insn *insns;
2259 machine_mode mode = GET_MODE (dest);
2260 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2261
2262 start_sequence ();
2263 if (TARGET_ILP32)
2264 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2265 else
2266 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2267 insns = get_insns ();
2268 end_sequence ();
2269
2270 RTL_CONST_CALL_P (insns) = 1;
2271 emit_libcall_block (insns, dest, result, imm);
2272 return;
2273 }
2274
2275 case SYMBOL_SMALL_TLSDESC:
2276 {
2277 machine_mode mode = GET_MODE (dest);
2278 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2279 rtx tp;
2280
2281 gcc_assert (mode == Pmode || mode == ptr_mode);
2282
2283 /* In ILP32, the got entry is always of SImode size. Unlike
2284 small GOT, the dest is fixed at reg 0. */
2285 if (TARGET_ILP32)
2286 emit_insn (gen_tlsdesc_small_si (imm));
2287 else
2288 emit_insn (gen_tlsdesc_small_di (imm));
2289 tp = aarch64_load_tp (NULL);
2290
2291 if (mode != Pmode)
2292 tp = gen_lowpart (mode, tp);
2293
2294 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2295 if (REG_P (dest))
2296 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2297 return;
2298 }
2299
2300 case SYMBOL_SMALL_TLSIE:
2301 {
2302 /* In ILP32, the mode of dest can be either SImode or DImode,
2303 while the got entry is always of SImode size. The mode of
2304 dest depends on how dest is used: if dest is assigned to a
2305 pointer (e.g. stored in memory), it has SImode; it may have
2306 DImode if dest is dereferenced to access memory.
2307 This is why we have to handle three different tlsie_small
2308 patterns here (two patterns for ILP32). */
2309 machine_mode mode = GET_MODE (dest);
2310 rtx tmp_reg = gen_reg_rtx (mode);
2311 rtx tp = aarch64_load_tp (NULL);
2312
2313 if (mode == ptr_mode)
2314 {
2315 if (mode == DImode)
2316 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2317 else
2318 {
2319 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2320 tp = gen_lowpart (mode, tp);
2321 }
2322 }
2323 else
2324 {
2325 gcc_assert (mode == Pmode);
2326 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2327 }
2328
2329 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2330 if (REG_P (dest))
2331 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2332 return;
2333 }
2334
2335 case SYMBOL_TLSLE12:
2336 case SYMBOL_TLSLE24:
2337 case SYMBOL_TLSLE32:
2338 case SYMBOL_TLSLE48:
2339 {
2340 machine_mode mode = GET_MODE (dest);
2341 rtx tp = aarch64_load_tp (NULL);
2342
2343 if (mode != Pmode)
2344 tp = gen_lowpart (mode, tp);
2345
2346 switch (type)
2347 {
2348 case SYMBOL_TLSLE12:
2349 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2350 (dest, tp, imm));
2351 break;
2352 case SYMBOL_TLSLE24:
2353 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2354 (dest, tp, imm));
2355 break;
2356 case SYMBOL_TLSLE32:
2357 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2358 (dest, imm));
2359 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2360 (dest, dest, tp));
2361 break;
2362 case SYMBOL_TLSLE48:
2363 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2364 (dest, imm));
2365 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2366 (dest, dest, tp));
2367 break;
2368 default:
2369 gcc_unreachable ();
2370 }
2371
2372 if (REG_P (dest))
2373 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2374 return;
2375 }
2376
2377 case SYMBOL_TINY_GOT:
2378 emit_insn (gen_ldr_got_tiny (dest, imm));
2379 return;
2380
2381 case SYMBOL_TINY_TLSIE:
2382 {
2383 machine_mode mode = GET_MODE (dest);
2384 rtx tp = aarch64_load_tp (NULL);
2385
2386 if (mode == ptr_mode)
2387 {
2388 if (mode == DImode)
2389 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2390 else
2391 {
2392 tp = gen_lowpart (mode, tp);
2393 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2394 }
2395 }
2396 else
2397 {
2398 gcc_assert (mode == Pmode);
2399 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2400 }
2401
2402 if (REG_P (dest))
2403 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2404 return;
2405 }
2406
2407 default:
2408 gcc_unreachable ();
2409 }
2410 }
2411
2412 /* Emit a move from SRC to DEST. Assume that the move expanders can
2413 handle all moves if !can_create_pseudo_p (). The distinction is
2414 important because, unlike emit_move_insn, the move expanders know
2415 how to force Pmode objects into the constant pool even when the
2416 constant pool address is not itself legitimate. */
2417 static rtx
2418 aarch64_emit_move (rtx dest, rtx src)
2419 {
2420 return (can_create_pseudo_p ()
2421 ? emit_move_insn (dest, src)
2422 : emit_move_insn_1 (dest, src));
2423 }
2424
2425 /* Apply UNOPTAB to OP and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2429 {
2430 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2431 if (dest != tmp)
2432 emit_move_insn (dest, tmp);
2433 }
2434
2435 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2436
2437 static void
2438 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2439 {
2440 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2441 OPTAB_DIRECT);
2442 if (dest != tmp)
2443 emit_move_insn (dest, tmp);
2444 }
2445
2446 /* Split a 128-bit move operation into two 64-bit move operations,
2447 taking care to handle partial overlap of register to register
2448 copies. Special cases are needed when moving between GP regs and
2449 FP regs. SRC can be a register, constant or memory; DST a register
2450 or memory. If either operand is memory it must not have any side
2451 effects. */
2452 void
2453 aarch64_split_128bit_move (rtx dst, rtx src)
2454 {
2455 rtx dst_lo, dst_hi;
2456 rtx src_lo, src_hi;
2457
2458 machine_mode mode = GET_MODE (dst);
2459
2460 gcc_assert (mode == TImode || mode == TFmode);
2461 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2462 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2463
2464 if (REG_P (dst) && REG_P (src))
2465 {
2466 int src_regno = REGNO (src);
2467 int dst_regno = REGNO (dst);
2468
2469 /* Handle FP <-> GP regs. */
2470 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2471 {
2472 src_lo = gen_lowpart (word_mode, src);
2473 src_hi = gen_highpart (word_mode, src);
2474
2475 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2476 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2477 return;
2478 }
2479 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2480 {
2481 dst_lo = gen_lowpart (word_mode, dst);
2482 dst_hi = gen_highpart (word_mode, dst);
2483
2484 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2485 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2486 return;
2487 }
2488 }
2489
2490 dst_lo = gen_lowpart (word_mode, dst);
2491 dst_hi = gen_highpart (word_mode, dst);
2492 src_lo = gen_lowpart (word_mode, src);
2493 src_hi = gen_highpart_mode (word_mode, mode, src);
2494
2495 /* At most one pairing may overlap. */
2496 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2497 {
2498 aarch64_emit_move (dst_hi, src_hi);
2499 aarch64_emit_move (dst_lo, src_lo);
2500 }
2501 else
2502 {
2503 aarch64_emit_move (dst_lo, src_lo);
2504 aarch64_emit_move (dst_hi, src_hi);
2505 }
2506 }
2507
2508 bool
2509 aarch64_split_128bit_move_p (rtx dst, rtx src)
2510 {
2511 return (! REG_P (src)
2512 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2513 }
2514
2515 /* Split a complex SIMD combine. */
2516
2517 void
2518 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2519 {
2520 machine_mode src_mode = GET_MODE (src1);
2521 machine_mode dst_mode = GET_MODE (dst);
2522
2523 gcc_assert (VECTOR_MODE_P (dst_mode));
2524 gcc_assert (register_operand (dst, dst_mode)
2525 && register_operand (src1, src_mode)
2526 && register_operand (src2, src_mode));
2527
2528 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2529 return;
2530 }
2531
2532 /* Split a complex SIMD move. */
2533
2534 void
2535 aarch64_split_simd_move (rtx dst, rtx src)
2536 {
2537 machine_mode src_mode = GET_MODE (src);
2538 machine_mode dst_mode = GET_MODE (dst);
2539
2540 gcc_assert (VECTOR_MODE_P (dst_mode));
2541
2542 if (REG_P (dst) && REG_P (src))
2543 {
2544 gcc_assert (VECTOR_MODE_P (src_mode));
2545 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2546 }
2547 }
2548
2549 bool
2550 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2551 machine_mode ymode, rtx y)
2552 {
2553 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2554 gcc_assert (r != NULL);
2555 return rtx_equal_p (x, r);
2556 }
2557
2558
2559 /* Return TARGET if it is nonnull and a register of mode MODE.
2560 Otherwise, return a fresh register of mode MODE if we can,
2561 or TARGET reinterpreted as MODE if we can't. */
2562
2563 static rtx
2564 aarch64_target_reg (rtx target, machine_mode mode)
2565 {
2566 if (target && REG_P (target) && GET_MODE (target) == mode)
2567 return target;
2568 if (!can_create_pseudo_p ())
2569 {
2570 gcc_assert (target);
2571 return gen_lowpart (mode, target);
2572 }
2573 return gen_reg_rtx (mode);
2574 }
2575
2576 /* Return a register that contains the constant in BUILDER, given that
2577 the constant is a legitimate move operand. Use TARGET as the register
2578 if it is nonnull and convenient. */
2579
2580 static rtx
2581 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2582 {
2583 rtx src = builder.build ();
2584 target = aarch64_target_reg (target, GET_MODE (src));
2585 emit_insn (gen_rtx_SET (target, src));
2586 return target;
2587 }
2588
2589 static rtx
2590 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2591 {
2592 if (can_create_pseudo_p ())
2593 return force_reg (mode, value);
2594 else
2595 {
2596 gcc_assert (x);
2597 aarch64_emit_move (x, value);
2598 return x;
2599 }
2600 }
2601
2602 /* Return true if predicate value X is a constant in which every element
2603 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2604 value, i.e. as a predicate in which all bits are significant. */
2605
2606 static bool
2607 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2608 {
2609 if (GET_CODE (x) != CONST_VECTOR)
2610 return false;
2611
2612 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2613 GET_MODE_NUNITS (GET_MODE (x)));
2614 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2615 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2616 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2617
2618 unsigned int nelts = const_vector_encoded_nelts (x);
2619 for (unsigned int i = 0; i < nelts; ++i)
2620 {
2621 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2622 if (!CONST_INT_P (elt))
2623 return false;
2624
2625 builder.quick_push (elt);
2626 for (unsigned int j = 1; j < factor; ++j)
2627 builder.quick_push (const0_rtx);
2628 }
2629 builder.finalize ();
2630 return true;
2631 }
2632
2633 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2634 widest predicate element size it can have (that is, the largest size
2635 for which each element would still be 0 or 1). */
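/* For example, a VNx16BI constant built from four one-element patterns
   { 1, 0, 0, 0 } (every fourth bit set) has a widest element size of 4:
   it can be interpreted as a .S predicate in which every element is
   active.  */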
2636
2637 unsigned int
2638 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2639 {
2640 /* Start with the most optimistic assumption: that we only need
2641 one bit per pattern. This is what we will use if only the first
2642 bit in each pattern is ever set. */
2643 unsigned int mask = GET_MODE_SIZE (DImode);
2644 mask |= builder.npatterns ();
2645
2646 /* Look for set bits. */
2647 unsigned int nelts = builder.encoded_nelts ();
2648 for (unsigned int i = 1; i < nelts; ++i)
2649 if (INTVAL (builder.elt (i)) != 0)
2650 {
2651 if (i & 1)
2652 return 1;
2653 mask |= i;
2654 }
2655 return mask & -mask;
2656 }
2657
2658 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2659 that the constant would have with predicate element size ELT_SIZE
2660 (ignoring the upper bits in each element) and return:
2661
2662 * -1 if all bits are set
2663 * N if the predicate has N leading set bits followed by all clear bits
2664 * 0 if the predicate does not have any of these forms. */
2665
2666 int
2667 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2668 unsigned int elt_size)
2669 {
2670 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2671 followed by set bits. */
2672 if (builder.nelts_per_pattern () == 3)
2673 return 0;
2674
2675 /* Skip over leading set bits. */
2676 unsigned int nelts = builder.encoded_nelts ();
2677 unsigned int i = 0;
2678 for (; i < nelts; i += elt_size)
2679 if (INTVAL (builder.elt (i)) == 0)
2680 break;
2681 unsigned int vl = i / elt_size;
2682
2683 /* Check for the all-true case. */
2684 if (i == nelts)
2685 return -1;
2686
2687 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2688 repeating pattern of set bits followed by clear bits. */
2689 if (builder.nelts_per_pattern () != 2)
2690 return 0;
2691
2692 /* We have a "foreground" value and a duplicated "background" value.
2693 If the background might repeat and the last set bit belongs to it,
2694 we might have set bits followed by clear bits followed by set bits. */
2695 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2696 return 0;
2697
2698 /* Make sure that the rest are all clear. */
2699 for (; i < nelts; i += elt_size)
2700 if (INTVAL (builder.elt (i)) != 0)
2701 return 0;
2702
2703 return vl;
2704 }
2705
2706 /* See if there is an svpattern that encodes an SVE predicate of mode
2707 PRED_MODE in which the first VL bits are set and the rest are clear.
2708 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2709 A VL of -1 indicates an all-true vector. */
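/* For example, a VL of 7 maps to AARCH64_SV_VL7 and a VL of 64 maps to
   AARCH64_SV_VL64, whereas a VL of 9 only has an encoding if the number
   of elements in PRED_MODE is known at compile time and 9 happens to
   match its POW2, MUL3 or MUL4 pattern.  */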
2710
2711 aarch64_svpattern
2712 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2713 {
2714 if (vl < 0)
2715 return AARCH64_SV_ALL;
2716
2717 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2718 return AARCH64_NUM_SVPATTERNS;
2719
2720 if (vl >= 1 && vl <= 8)
2721 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2722
2723 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2724 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2725
2726 int max_vl;
2727 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2728 {
2729 if (vl == (max_vl / 3) * 3)
2730 return AARCH64_SV_MUL3;
2731 /* These would only trigger for non-power-of-2 lengths. */
2732 if (vl == (max_vl & -4))
2733 return AARCH64_SV_MUL4;
2734 if (vl == (1 << floor_log2 (max_vl)))
2735 return AARCH64_SV_POW2;
2736 if (vl == max_vl)
2737 return AARCH64_SV_ALL;
2738 }
2739 return AARCH64_NUM_SVPATTERNS;
2740 }
2741
2742 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2743 bits has the lowest bit set and the upper bits clear. This is the
2744 VNx16BImode equivalent of a PTRUE for controlling elements of
2745 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2746 all bits are significant, even the upper zeros. */
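/* For example, aarch64_ptrue_all (2) is the VNx16BImode constant
   { 1, 0, 1, 0, ... }, which is the value that "ptrue pN.h, all"
   produces when viewed as a full byte-level predicate.  */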
2747
2748 rtx
2749 aarch64_ptrue_all (unsigned int elt_size)
2750 {
2751 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2752 builder.quick_push (const1_rtx);
2753 for (unsigned int i = 1; i < elt_size; ++i)
2754 builder.quick_push (const0_rtx);
2755 return builder.build ();
2756 }
2757
2758 /* Return an all-true predicate register of mode MODE. */
2759
2760 rtx
2761 aarch64_ptrue_reg (machine_mode mode)
2762 {
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2766 }
2767
2768 /* Return an all-false predicate register of mode MODE. */
2769
2770 rtx
2771 aarch64_pfalse_reg (machine_mode mode)
2772 {
2773 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2774 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2775 return gen_lowpart (mode, reg);
2776 }
2777
2778 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2779 true, or alternatively if we know that the operation predicated by
2780 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2781 aarch64_sve_gp_strictness operand that describes the operation
2782 predicated by PRED1[0]. */
2783
2784 bool
2785 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2786 {
2787 machine_mode mode = GET_MODE (pred2);
2788 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2789 && mode == GET_MODE (pred1[0])
2790 && aarch64_sve_gp_strictness (pred1[1], SImode));
2791 return (pred1[0] == CONSTM1_RTX (mode)
2792 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2793 || rtx_equal_p (pred1[0], pred2));
2794 }
2795
2796 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2797 for it. PRED2[0] is the predicate for the instruction whose result
2798 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2799 for it. Return true if we can prove that the two predicates are
2800 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2801 with PRED1[0] without changing behavior. */
2802
2803 bool
2804 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2805 {
2806 machine_mode mode = GET_MODE (pred1[0]);
2807 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2808 && mode == GET_MODE (pred2[0])
2809 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2810 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2811
2812 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2813 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2814 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2815 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2816 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2817 }
2818
2819 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2820 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2821 Use TARGET as the target register if nonnull and convenient. */
2822
2823 static rtx
2824 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2825 machine_mode data_mode, rtx op1, rtx op2)
2826 {
2827 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2828 expand_operand ops[5];
2829 create_output_operand (&ops[0], target, pred_mode);
2830 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2831 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2832 create_input_operand (&ops[3], op1, data_mode);
2833 create_input_operand (&ops[4], op2, data_mode);
2834 expand_insn (icode, 5, ops);
2835 return ops[0].value;
2836 }
2837
2838 /* Use a comparison to convert integer vector SRC into MODE, which is
2839 the corresponding SVE predicate mode. Use TARGET for the result
2840 if it's nonnull and convenient. */
2841
2842 static rtx
2843 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2844 {
2845 machine_mode src_mode = GET_MODE (src);
2846 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2847 src, CONST0_RTX (src_mode));
2848 }
2849
2850 /* Return true if we can move VALUE into a register using a single
2851 CNT[BHWD] instruction. */
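/* For example, poly_int64 (2, 2) is the value of CNTD (two doublewords
   per 128-bit quadword) and poly_int64 (240, 240) can be loaded with
   "cntb x0, all, mul #15", whereas poly_int64 (34, 34) would need a
   multiplier of 17 and so is rejected.  */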
2852
2853 static bool
2854 aarch64_sve_cnt_immediate_p (poly_int64 value)
2855 {
2856 HOST_WIDE_INT factor = value.coeffs[0];
2857 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2858 return (value.coeffs[1] == factor
2859 && IN_RANGE (factor, 2, 16 * 16)
2860 && (factor & 1) == 0
2861 && factor <= 16 * (factor & -factor));
2862 }
2863
2864 /* Likewise for rtx X. */
2865
2866 bool
2867 aarch64_sve_cnt_immediate_p (rtx x)
2868 {
2869 poly_int64 value;
2870 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2871 }
2872
2873 /* Return the asm string for an instruction with a CNT-like vector size
2874 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2875 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2876 first part of the operands template (the part that comes before the
2877 vector size itself). FACTOR is the number of quadwords.
2878 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2879 If it is zero, we can use any element size. */
2880
2881 static char *
2882 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2883 unsigned int factor,
2884 unsigned int nelts_per_vq)
2885 {
2886 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2887
2888 if (nelts_per_vq == 0)
2889 /* There is some overlap in the ranges of the four CNT instructions.
2890 Here we always use the smallest possible element size, so that the
2891 multiplier is 1 wherever possible. */
2892 nelts_per_vq = factor & -factor;
2893 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2894 gcc_assert (IN_RANGE (shift, 1, 4));
2895 char suffix = "dwhb"[shift - 1];
2896
2897 factor >>= shift;
2898 unsigned int written;
2899 if (factor == 1)
2900 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2901 prefix, suffix, operands);
2902 else
2903 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2904 prefix, suffix, operands, factor);
2905 gcc_assert (written < sizeof (buffer));
2906 return buffer;
2907 }
2908
2909 /* Return the asm string for an instruction with a CNT-like vector size
2910 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2911 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2912 first part of the operands template (the part that comes before the
2913 vector size itself). X is the value of the vector size operand,
2914 as a polynomial integer rtx. */
2915
2916 char *
2917 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2918 rtx x)
2919 {
2920 poly_int64 value = rtx_to_poly_int64 (x);
2921 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2922 return aarch64_output_sve_cnt_immediate (prefix, operands,
2923 value.coeffs[1], 0);
2924 }
2925
2926 /* Return true if we can add VALUE to a register using a single ADDVL
2927 or ADDPL instruction. */
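/* For example, poly_int64 (16, 16) is one full vector and can be added
   with "addvl x0, x1, #1", poly_int64 (6, 6) is three predicate widths
   and can be added with "addpl x0, x1, #3", whereas poly_int64 (3, 3)
   is not a whole number of predicate widths and is rejected.  */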
2928
2929 static bool
2930 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2931 {
2932 HOST_WIDE_INT factor = value.coeffs[0];
2933 if (factor == 0 || value.coeffs[1] != factor)
2934 return false;
2935 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2936 and a value of 16 is one vector width. */
2937 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2938 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2939 }
2940
2941 /* Likewise for rtx X. */
2942
2943 bool
2944 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2945 {
2946 poly_int64 value;
2947 return (poly_int_rtx_p (x, &value)
2948 && aarch64_sve_addvl_addpl_immediate_p (value));
2949 }
2950
2951 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2952 and storing the result in operand 0. */
2953
2954 char *
2955 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2956 {
2957 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2958 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2959 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2960
2961 /* Use INC or DEC if possible. */
2962 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2963 {
2964 if (aarch64_sve_cnt_immediate_p (offset_value))
2965 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2966 offset_value.coeffs[1], 0);
2967 if (aarch64_sve_cnt_immediate_p (-offset_value))
2968 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2969 -offset_value.coeffs[1], 0);
2970 }
2971
2972 int factor = offset_value.coeffs[1];
2973 if ((factor & 15) == 0)
2974 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2975 else
2976 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2977 return buffer;
2978 }
2979
2980 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2981 instruction. If it is, store the number of elements in each vector
2982 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2983 factor in *FACTOR_OUT (if nonnull). */
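/* For example, a VNx4SI constant in which every element is
   poly_int64 (4, 4) (the number of 32-bit elements in an SVE vector)
   can be added with a single INCW, whereas a duplicate of
   poly_int64 (68, 68) is rejected because it would need a multiplier
   of 17.  */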
2984
2985 bool
2986 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2987 unsigned int *nelts_per_vq_out)
2988 {
2989 rtx elt;
2990 poly_int64 value;
2991
2992 if (!const_vec_duplicate_p (x, &elt)
2993 || !poly_int_rtx_p (elt, &value))
2994 return false;
2995
2996 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2997 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2998 /* There's no vector INCB. */
2999 return false;
3000
3001 HOST_WIDE_INT factor = value.coeffs[0];
3002 if (value.coeffs[1] != factor)
3003 return false;
3004
3005 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3006 if ((factor % nelts_per_vq) != 0
3007 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3008 return false;
3009
3010 if (factor_out)
3011 *factor_out = factor;
3012 if (nelts_per_vq_out)
3013 *nelts_per_vq_out = nelts_per_vq;
3014 return true;
3015 }
3016
3017 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3018 instruction. */
3019
3020 bool
3021 aarch64_sve_inc_dec_immediate_p (rtx x)
3022 {
3023 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3024 }
3025
3026 /* Return the asm template for an SVE vector INC or DEC instruction.
3027 OPERANDS gives the operands before the vector count and X is the
3028 value of the vector count operand itself. */
3029
3030 char *
3031 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3032 {
3033 int factor;
3034 unsigned int nelts_per_vq;
3035 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3036 gcc_unreachable ();
3037 if (factor < 0)
3038 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3039 nelts_per_vq);
3040 else
3041 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3042 nelts_per_vq);
3043 }
3044
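/* Set DEST to immediate IMM of mode MODE, returning the number of
   instructions required; only emit the instructions if GENERATE is true.
   For example, 0x1234cafe00000000 takes two instructions: a move of
   0xcafe00000000 followed by a MOVK of 0x1234 into bits [48, 63].  */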
3045 static int
3046 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3047 scalar_int_mode mode)
3048 {
3049 int i;
3050 unsigned HOST_WIDE_INT val, val2, mask;
3051 int one_match, zero_match;
3052 int num_insns;
3053
3054 val = INTVAL (imm);
3055
3056 if (aarch64_move_imm (val, mode))
3057 {
3058 if (generate)
3059 emit_insn (gen_rtx_SET (dest, imm));
3060 return 1;
3061 }
3062
3063 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3064 (with XXXX non-zero). In that case check to see if the move can be done in
3065 a smaller mode. */
3066 val2 = val & 0xffffffff;
3067 if (mode == DImode
3068 && aarch64_move_imm (val2, SImode)
3069 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3070 {
3071 if (generate)
3072 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3073
3074 /* Check if we have to emit a second instruction by checking to see
3075 if any of the upper 32 bits of the original DI mode value is set. */
3076 if (val == val2)
3077 return 1;
3078
3079 i = (val >> 48) ? 48 : 32;
3080
3081 if (generate)
3082 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3083 GEN_INT ((val >> i) & 0xffff)));
3084
3085 return 2;
3086 }
3087
3088 if ((val >> 32) == 0 || mode == SImode)
3089 {
3090 if (generate)
3091 {
3092 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3093 if (mode == SImode)
3094 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3095 GEN_INT ((val >> 16) & 0xffff)));
3096 else
3097 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3098 GEN_INT ((val >> 16) & 0xffff)));
3099 }
3100 return 2;
3101 }
3102
3103 /* Remaining cases are all for DImode. */
3104
3105 mask = 0xffff;
3106 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3107 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3108 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3109 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3110
3111 if (zero_match != 2 && one_match != 2)
3112 {
3113 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3114 For a 64-bit bitmask try whether changing 16 bits to all ones or
3115 zeroes creates a valid bitmask. To check any repeated bitmask,
3116 try using 16 bits from the other 32-bit half of val. */
3117
3118 for (i = 0; i < 64; i += 16, mask <<= 16)
3119 {
3120 val2 = val & ~mask;
3121 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3122 break;
3123 val2 = val | mask;
3124 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3125 break;
3126 val2 = val2 & ~mask;
3127 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3128 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3129 break;
3130 }
3131 if (i != 64)
3132 {
3133 if (generate)
3134 {
3135 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3136 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3137 GEN_INT ((val >> i) & 0xffff)));
3138 }
3139 return 2;
3140 }
3141 }
3142
3143 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3144 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3145 otherwise skip zero bits. */
3146
3147 num_insns = 1;
3148 mask = 0xffff;
3149 val2 = one_match > zero_match ? ~val : val;
3150 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3151
3152 if (generate)
3153 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3154 ? (val | ~(mask << i))
3155 : (val & (mask << i)))));
3156 for (i += 16; i < 64; i += 16)
3157 {
3158 if ((val2 & (mask << i)) == 0)
3159 continue;
3160 if (generate)
3161 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3162 GEN_INT ((val >> i) & 0xffff)));
3163 num_insns ++;
3164 }
3165
3166 return num_insns;
3167 }
3168
3169 /* Return whether imm is a 128-bit immediate which is simple enough to
3170 expand inline. */
3171 bool
3172 aarch64_mov128_immediate (rtx imm)
3173 {
3174 if (GET_CODE (imm) == CONST_INT)
3175 return true;
3176
3177 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3178
3179 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3180 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3181
3182 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3183 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3184 }
3185
3186
3187 /* Return the number of temporary registers that aarch64_add_offset_1
3188 would need to add OFFSET to a register. */
3189
3190 static unsigned int
3191 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3192 {
3193 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3194 }
3195
3196 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3197 a non-polynomial OFFSET. MODE is the mode of the addition.
3198 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3199 be set and CFA adjustments added to the generated instructions.
3200
3201 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3202 temporary if register allocation is already complete. This temporary
3203 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3204 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3205 the immediate again.
3206
3207 Since this function may be used to adjust the stack pointer, we must
3208 ensure that it cannot cause transient stack deallocation (for example
3209 by first incrementing SP and then decrementing when adjusting by a
3210 large immediate). */
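/* For example, an OFFSET of 0x123456 is neither a valid 12-bit
   (optionally shifted) ADD immediate nor a single MOV immediate, so it
   is handled as an ADD of #0x456 followed by an ADD of #0x123000.  */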
3211
3212 static void
3213 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3214 rtx src, HOST_WIDE_INT offset, rtx temp1,
3215 bool frame_related_p, bool emit_move_imm)
3216 {
3217 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3218 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3219
3220 HOST_WIDE_INT moffset = abs_hwi (offset);
3221 rtx_insn *insn;
3222
3223 if (!moffset)
3224 {
3225 if (!rtx_equal_p (dest, src))
3226 {
3227 insn = emit_insn (gen_rtx_SET (dest, src));
3228 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3229 }
3230 return;
3231 }
3232
3233 /* Single instruction adjustment. */
3234 if (aarch64_uimm12_shift (moffset))
3235 {
3236 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3237 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3238 return;
3239 }
3240
3241 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3242 and either:
3243
3244 a) the offset cannot be loaded by a 16-bit move or
3245 b) there is no spare register into which we can move it. */
3246 if (moffset < 0x1000000
3247 && ((!temp1 && !can_create_pseudo_p ())
3248 || !aarch64_move_imm (moffset, mode)))
3249 {
3250 HOST_WIDE_INT low_off = moffset & 0xfff;
3251
3252 low_off = offset < 0 ? -low_off : low_off;
3253 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3254 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3255 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3256 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3257 return;
3258 }
3259
3260 /* Emit a move immediate if required and an addition/subtraction. */
3261 if (emit_move_imm)
3262 {
3263 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3264 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3265 }
3266 insn = emit_insn (offset < 0
3267 ? gen_sub3_insn (dest, src, temp1)
3268 : gen_add3_insn (dest, src, temp1));
3269 if (frame_related_p)
3270 {
3271 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3272 rtx adj = plus_constant (mode, src, offset);
3273 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3274 }
3275 }
3276
3277 /* Return the number of temporary registers that aarch64_add_offset
3278 would need to move OFFSET into a register or add OFFSET to a register;
3279 ADD_P is true if we want the latter rather than the former. */
3280
3281 static unsigned int
3282 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3283 {
3284 /* This follows the same structure as aarch64_add_offset. */
3285 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3286 return 0;
3287
3288 unsigned int count = 0;
3289 HOST_WIDE_INT factor = offset.coeffs[1];
3290 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3291 poly_int64 poly_offset (factor, factor);
3292 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3293 /* Need one register for the ADDVL/ADDPL result. */
3294 count += 1;
3295 else if (factor != 0)
3296 {
3297 factor = abs (factor);
3298 if (factor > 16 * (factor & -factor))
3299 /* Need one register for the CNT result and one for the multiplication
3300 factor. If necessary, the second temporary can be reused for the
3301 constant part of the offset. */
3302 return 2;
3303 /* Need one register for the CNT result (which might then
3304 be shifted). */
3305 count += 1;
3306 }
3307 return count + aarch64_add_offset_1_temporaries (constant);
3308 }
3309
3310 /* If X can be represented as a poly_int64, return the number
3311 of temporaries that are required to add it to a register.
3312 Return -1 otherwise. */
3313
3314 int
3315 aarch64_add_offset_temporaries (rtx x)
3316 {
3317 poly_int64 offset;
3318 if (!poly_int_rtx_p (x, &offset))
3319 return -1;
3320 return aarch64_offset_temporaries (true, offset);
3321 }
3322
3323 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3324 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3325 be set and CFA adjustments added to the generated instructions.
3326
3327 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3328 temporary if register allocation is already complete. This temporary
3329 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3330 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3331 false to avoid emitting the immediate again.
3332
3333 TEMP2, if nonnull, is a second temporary register that doesn't
3334 overlap either DEST or REG.
3335
3336 Since this function may be used to adjust the stack pointer, we must
3337 ensure that it cannot cause transient stack deallocation (for example
3338 by first incrementing SP and then decrementing when adjusting by a
3339 large immediate). */
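/* For example, an OFFSET of poly_int64 (24, 16), i.e. one full SVE
   vector plus 8 bytes, is added using ADDVL #1 for the vector-length
   part followed by an ADD of #8 for the constant part.  */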
3340
3341 static void
3342 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3343 poly_int64 offset, rtx temp1, rtx temp2,
3344 bool frame_related_p, bool emit_move_imm = true)
3345 {
3346 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3347 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3348 gcc_assert (temp1 == NULL_RTX
3349 || !frame_related_p
3350 || !reg_overlap_mentioned_p (temp1, dest));
3351 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3352
3353 /* Try using ADDVL or ADDPL to add the whole value. */
3354 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3355 {
3356 rtx offset_rtx = gen_int_mode (offset, mode);
3357 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3358 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3359 return;
3360 }
3361
3362 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3363 SVE vector register, over and above the minimum size of 128 bits.
3364 This is equivalent to half the value returned by CNTD with a
3365 vector shape of ALL. */
3366 HOST_WIDE_INT factor = offset.coeffs[1];
3367 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3368
3369 /* Try using ADDVL or ADDPL to add the VG-based part. */
3370 poly_int64 poly_offset (factor, factor);
3371 if (src != const0_rtx
3372 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3373 {
3374 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3375 if (frame_related_p)
3376 {
3377 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3378 RTX_FRAME_RELATED_P (insn) = true;
3379 src = dest;
3380 }
3381 else
3382 {
3383 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3384 src = aarch64_force_temporary (mode, temp1, addr);
3385 temp1 = temp2;
3386 temp2 = NULL_RTX;
3387 }
3388 }
3389 /* Otherwise use a CNT-based sequence. */
3390 else if (factor != 0)
3391 {
3392 /* Use a subtraction if we have a negative factor. */
3393 rtx_code code = PLUS;
3394 if (factor < 0)
3395 {
3396 factor = -factor;
3397 code = MINUS;
3398 }
3399
3400 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3401 into the multiplication. */
3402 rtx val;
3403 int shift = 0;
3404 if (factor & 1)
3405 /* Use a right shift by 1. */
3406 shift = -1;
3407 else
3408 factor /= 2;
3409 HOST_WIDE_INT low_bit = factor & -factor;
3410 if (factor <= 16 * low_bit)
3411 {
3412 if (factor > 16 * 8)
3413 {
3414 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3415 the value with the minimum multiplier and shift it into
3416 position. */
3417 int extra_shift = exact_log2 (low_bit);
3418 shift += extra_shift;
3419 factor >>= extra_shift;
3420 }
3421 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3422 }
3423 else
3424 {
3425 /* Use CNTD, then multiply it by FACTOR. */
3426 val = gen_int_mode (poly_int64 (2, 2), mode);
3427 val = aarch64_force_temporary (mode, temp1, val);
3428
3429 /* Go back to using a negative multiplication factor if we have
3430 no register from which to subtract. */
3431 if (code == MINUS && src == const0_rtx)
3432 {
3433 factor = -factor;
3434 code = PLUS;
3435 }
3436 rtx coeff1 = gen_int_mode (factor, mode);
3437 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3438 val = gen_rtx_MULT (mode, val, coeff1);
3439 }
3440
3441 if (shift > 0)
3442 {
3443 /* Multiply by 1 << SHIFT. */
3444 val = aarch64_force_temporary (mode, temp1, val);
3445 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3446 }
3447 else if (shift == -1)
3448 {
3449 /* Divide by 2. */
3450 val = aarch64_force_temporary (mode, temp1, val);
3451 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3452 }
3453
3454 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3455 if (src != const0_rtx)
3456 {
3457 val = aarch64_force_temporary (mode, temp1, val);
3458 val = gen_rtx_fmt_ee (code, mode, src, val);
3459 }
3460 else if (code == MINUS)
3461 {
3462 val = aarch64_force_temporary (mode, temp1, val);
3463 val = gen_rtx_NEG (mode, val);
3464 }
3465
3466 if (constant == 0 || frame_related_p)
3467 {
3468 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3469 if (frame_related_p)
3470 {
3471 RTX_FRAME_RELATED_P (insn) = true;
3472 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3473 gen_rtx_SET (dest, plus_constant (Pmode, src,
3474 poly_offset)));
3475 }
3476 src = dest;
3477 if (constant == 0)
3478 return;
3479 }
3480 else
3481 {
3482 src = aarch64_force_temporary (mode, temp1, val);
3483 temp1 = temp2;
3484 temp2 = NULL_RTX;
3485 }
3486
3487 emit_move_imm = true;
3488 }
3489
3490 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3491 frame_related_p, emit_move_imm);
3492 }
3493
3494 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3495 than a poly_int64. */
3496
3497 void
3498 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3499 rtx offset_rtx, rtx temp1, rtx temp2)
3500 {
3501 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3502 temp1, temp2, false);
3503 }
3504
3505 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3506 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3507 if TEMP1 already contains abs (DELTA). */
3508
3509 static inline void
3510 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3511 {
3512 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3513 temp1, temp2, true, emit_move_imm);
3514 }
3515
3516 /* Subtract DELTA from the stack pointer, marking the instructions
3517 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3518 if nonnull. */
3519
3520 static inline void
3521 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3522 bool emit_move_imm = true)
3523 {
3524 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3525 temp1, temp2, frame_related_p, emit_move_imm);
3526 }
3527
3528 /* Set DEST to (vec_series BASE STEP). */
3529
3530 static void
3531 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3532 {
3533 machine_mode mode = GET_MODE (dest);
3534 scalar_mode inner = GET_MODE_INNER (mode);
3535
3536 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3537 if (!aarch64_sve_index_immediate_p (base))
3538 base = force_reg (inner, base);
3539 if (!aarch64_sve_index_immediate_p (step))
3540 step = force_reg (inner, step);
3541
3542 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3543 }
3544
3545 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3546 register of mode MODE. Use TARGET for the result if it's nonnull
3547 and convenient.
3548
3549 The two vector modes must have the same element mode. The behavior
3550 is to duplicate architectural lane N of SRC into architectural lanes
3551 N + I * STEP of the result. On big-endian targets, architectural
3552 lane 0 of an Advanced SIMD vector is the last element of the vector
3553 in memory layout, so for big-endian targets this operation has the
3554 effect of reversing SRC before duplicating it. Callers need to
3555 account for this. */
3556
3557 rtx
3558 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3559 {
3560 machine_mode src_mode = GET_MODE (src);
3561 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3562 insn_code icode = (BYTES_BIG_ENDIAN
3563 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3564 : code_for_aarch64_vec_duplicate_vq_le (mode));
3565
3566 unsigned int i = 0;
3567 expand_operand ops[3];
3568 create_output_operand (&ops[i++], target, mode);
3569 create_output_operand (&ops[i++], src, src_mode);
3570 if (BYTES_BIG_ENDIAN)
3571 {
3572 /* Create a PARALLEL describing the reversal of SRC. */
3573 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3574 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3575 nelts_per_vq - 1, -1);
3576 create_fixed_operand (&ops[i++], sel);
3577 }
3578 expand_insn (icode, i, ops);
3579 return ops[0].value;
3580 }
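/* For illustration (register numbers are arbitrary): on little-endian
   targets the expansion above typically reduces to a single quadword
   duplicate:

       dup     z0.q, z1.q[0]

   while on big-endian targets the _be pattern also applies the lane
   reversal described in the comment above.  */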
3581
3582 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3583 the memory image into DEST. Return true on success. */
3584
3585 static bool
3586 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3587 {
3588 src = force_const_mem (GET_MODE (src), src);
3589 if (!src)
3590 return false;
3591
3592 /* Make sure that the address is legitimate. */
3593 if (!aarch64_sve_ld1rq_operand_p (src))
3594 {
3595 rtx addr = force_reg (Pmode, XEXP (src, 0));
3596 src = replace_equiv_address (src, addr);
3597 }
3598
3599 machine_mode mode = GET_MODE (dest);
3600 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3601 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3602 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3603 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3604 return true;
3605 }
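/* For illustration (register numbers are arbitrary): for .S elements the
   load emitted above is a replicating quadword load such as

       ld1rqw  z0.s, p0/z, [x0]

   which broadcasts the 128-bit memory value to every quadword of z0.  */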
3606
3607 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3608 SVE data mode and isn't a legitimate constant. Use TARGET for the
3609 result if convenient.
3610
3611 The returned register can have whatever mode seems most natural
3612 given the contents of SRC. */
3613
3614 static rtx
3615 aarch64_expand_sve_const_vector (rtx target, rtx src)
3616 {
3617 machine_mode mode = GET_MODE (src);
3618 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3619 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3620 scalar_mode elt_mode = GET_MODE_INNER (mode);
3621 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3622 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3623
3624 if (nelts_per_pattern == 1 && encoded_bits == 128)
3625 {
3626 /* The constant is a duplicated quadword but can't be narrowed
3627 beyond a quadword. Get the memory image of the first quadword
3628 as a 128-bit vector and try using LD1RQ to load it from memory.
3629
3630 The effect for both endiannesses is to load memory lane N into
3631 architectural lanes N + I * STEP of the result. On big-endian
3632 targets, the layout of the 128-bit vector in an Advanced SIMD
3633 register would be different from its layout in an SVE register,
3634 but this 128-bit vector is a memory value only. */
3635 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3636 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3637 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3638 return target;
3639 }
3640
3641 if (nelts_per_pattern == 1 && encoded_bits < 128)
3642 {
3643 /* The vector is a repeating sequence of 64 bits or fewer.
3644 See if we can load them using an Advanced SIMD move and then
3645 duplicate it to fill a vector. This is better than using a GPR
3646 move because it keeps everything in the same register file. */
3647 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3648 rtx_vector_builder builder (vq_mode, npatterns, 1);
3649 for (unsigned int i = 0; i < npatterns; ++i)
3650 {
3651 /* We want memory lane N to go into architectural lane N,
3652 so reverse for big-endian targets. The DUP .Q pattern
3653 has a compensating reverse built-in. */
3654 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3655 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3656 }
3657 rtx vq_src = builder.build ();
3658 if (aarch64_simd_valid_immediate (vq_src, NULL))
3659 {
3660 vq_src = force_reg (vq_mode, vq_src);
3661 return aarch64_expand_sve_dupq (target, mode, vq_src);
3662 }
3663
3664 /* Get an integer representation of the repeating part of Advanced
3665 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3666 which for big-endian targets is lane-swapped wrt a normal
3667 Advanced SIMD vector. This means that for both endiannesses,
3668 memory lane N of SVE vector SRC corresponds to architectural
3669 lane N of a register holding VQ_SRC. This in turn means that
3670 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3671 as a single 128-bit value) and thus that memory lane 0 of SRC is
3672 in the lsb of the integer. Duplicating the integer therefore
3673 ensures that memory lane N of SRC goes into architectural lane
3674 N + I * INDEX of the SVE register. */
3675 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3676 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3677 if (elt_value)
3678 {
3679 /* Pretend that we had a vector of INT_MODE to start with. */
3680 elt_mode = int_mode;
3681 mode = aarch64_full_sve_mode (int_mode).require ();
3682
3683 /* If the integer can be moved into a general register by a
3684 single instruction, do that and duplicate the result. */
3685 if (CONST_INT_P (elt_value)
3686 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3687 {
3688 elt_value = force_reg (elt_mode, elt_value);
3689 return expand_vector_broadcast (mode, elt_value);
3690 }
3691 }
3692 else if (npatterns == 1)
3693 /* We're duplicating a single value, but can't do better than
3694 force it to memory and load from there. This handles things
3695 like symbolic constants. */
3696 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3697
3698 if (elt_value)
3699 {
3700 /* Load the element from memory if we can, otherwise move it into
3701 a register and use a DUP. */
3702 rtx op = force_const_mem (elt_mode, elt_value);
3703 if (!op)
3704 op = force_reg (elt_mode, elt_value);
3705 return expand_vector_broadcast (mode, op);
3706 }
3707 }
3708
3709 /* Try using INDEX. */
3710 rtx base, step;
3711 if (const_vec_series_p (src, &base, &step))
3712 {
3713 aarch64_expand_vec_series (target, base, step);
3714 return target;
3715 }
3716
3717 /* From here on, it's better to force the whole constant to memory
3718 if we can. */
3719 if (GET_MODE_NUNITS (mode).is_constant ())
3720 return NULL_RTX;
3721
3722 /* Expand each pattern individually. */
3723 gcc_assert (npatterns > 1);
3724 rtx_vector_builder builder;
3725 auto_vec<rtx, 16> vectors (npatterns);
3726 for (unsigned int i = 0; i < npatterns; ++i)
3727 {
3728 builder.new_vector (mode, 1, nelts_per_pattern);
3729 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3730 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3731 vectors.quick_push (force_reg (mode, builder.build ()));
3732 }
3733
3734 /* Use permutes to interleave the separate vectors. */
3735 while (npatterns > 1)
3736 {
3737 npatterns /= 2;
3738 for (unsigned int i = 0; i < npatterns; ++i)
3739 {
3740 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3741 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3742 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3743 vectors[i] = tmp;
3744 }
3745 }
3746 gcc_assert (vectors[0] == target);
3747 return target;
3748 }
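/* For illustration: with NPATTERNS == 4 the interleaving loop above first
   forms ZIP1 (V0, V2) and ZIP1 (V1, V3), then zips those two results
   together, so that pattern I ends up supplying elements I, I + 4,
   I + 8, ... of TARGET, as the encoding requires.  */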
3749
3750 /* Use WHILE to set a predicate register of mode MODE in which the first
3751 VL bits are set and the rest are clear. Use TARGET for the register
3752 if it's nonnull and convenient. */
3753
3754 static rtx
3755 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3756 unsigned int vl)
3757 {
3758 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3759 target = aarch64_target_reg (target, mode);
3760 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3761 return target;
3762 }
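/* For illustration (register numbers are arbitrary): for a .B predicate
   and VL == 3 the sequence above is roughly

       mov     x0, 3
       whilelo p0.b, xzr, x0

   which sets the first three predicate lanes and clears the rest.  */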
3763
3764 static rtx
3765 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3766
3767 /* BUILDER is a constant predicate in which the index of every set bit
3768 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3769 by inverting every element at a multiple of ELT_SIZE and EORing the
3770 result with an ELT_SIZE PTRUE.
3771
3772 Return a register that contains the constant on success, otherwise
3773 return null. Use TARGET as the register if it is nonnull and
3774 convenient. */
3775
3776 static rtx
3777 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3778 unsigned int elt_size)
3779 {
3780 /* Invert every element at a multiple of ELT_SIZE, keeping the
3781 other bits zero. */
3782 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3783 builder.nelts_per_pattern ());
3784 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3785 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3786 inv_builder.quick_push (const1_rtx);
3787 else
3788 inv_builder.quick_push (const0_rtx);
3789 inv_builder.finalize ();
3790
3791 /* See if we can load the constant cheaply. */
3792 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3793 if (!inv)
3794 return NULL_RTX;
3795
3796 /* EOR the result with an ELT_SIZE PTRUE. */
3797 rtx mask = aarch64_ptrue_all (elt_size);
3798 mask = force_reg (VNx16BImode, mask);
3799 target = aarch64_target_reg (target, VNx16BImode);
3800 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3801 return target;
3802 }
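/* For illustration, with ELT_SIZE == 2 (.H elements shown as bytes):

     constant:        { 1,0, 0,0, 1,0, 1,0, ... }
     inverted (INV):  { 0,0, 1,0, 0,0, 0,0, ... }
     PTRUE .H mask:   { 1,0, 1,0, 1,0, 1,0, ... }

   EORing INV with the mask restores the original constant, and INV may be
   cheaper to construct (it may itself be a PTRUE pattern, for example).  */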
3803
3804 /* BUILDER is a constant predicate in which the index of every set bit
3805 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3806 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3807 register on success, otherwise return null. Use TARGET as the register
3808 if nonnull and convenient. */
3809
3810 static rtx
3811 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3812 unsigned int elt_size,
3813 unsigned int permute_size)
3814 {
3815 /* We're going to split the constant into two new constants A and B,
3816 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3817 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3818
3819 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3820 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3821
3822 where _ indicates elements that will be discarded by the permute.
3823
3824 First calculate the ELT_SIZEs for A and B. */
3825 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3826 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3827 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3828 if (INTVAL (builder.elt (i)) != 0)
3829 {
3830 if (i & permute_size)
3831 b_elt_size |= i - permute_size;
3832 else
3833 a_elt_size |= i;
3834 }
3835 a_elt_size &= -a_elt_size;
3836 b_elt_size &= -b_elt_size;
3837
3838 /* Now construct the vectors themselves. */
3839 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3840 builder.nelts_per_pattern ());
3841 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3842 builder.nelts_per_pattern ());
3843 unsigned int nelts = builder.encoded_nelts ();
3844 for (unsigned int i = 0; i < nelts; ++i)
3845 if (i & (elt_size - 1))
3846 {
3847 a_builder.quick_push (const0_rtx);
3848 b_builder.quick_push (const0_rtx);
3849 }
3850 else if ((i & permute_size) == 0)
3851 {
3852 /* The A and B elements are significant. */
3853 a_builder.quick_push (builder.elt (i));
3854 b_builder.quick_push (builder.elt (i + permute_size));
3855 }
3856 else
3857 {
3858 /* The A and B elements are going to be discarded, so pick whatever
3859 is likely to give a nice constant. We are targeting element
3860 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3861 with the aim of each being a sequence of ones followed by
3862 a sequence of zeros. So:
3863
3864 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3865 duplicate the last X_ELT_SIZE element, to extend the
3866 current sequence of ones or zeros.
3867
3868 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3869 zero, so that the constant really does have X_ELT_SIZE and
3870 not a smaller size. */
3871 if (a_elt_size > permute_size)
3872 a_builder.quick_push (const0_rtx);
3873 else
3874 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3875 if (b_elt_size > permute_size)
3876 b_builder.quick_push (const0_rtx);
3877 else
3878 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3879 }
3880 a_builder.finalize ();
3881 b_builder.finalize ();
3882
3883 /* Try loading A into a register. */
3884 rtx_insn *last = get_last_insn ();
3885 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3886 if (!a)
3887 return NULL_RTX;
3888
3889 /* Try loading B into a register. */
3890 rtx b = a;
3891 if (a_builder != b_builder)
3892 {
3893 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3894 if (!b)
3895 {
3896 delete_insns_since (last);
3897 return NULL_RTX;
3898 }
3899 }
3900
3901 /* Emit the TRN1 itself. */
3902 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3903 target = aarch64_target_reg (target, mode);
3904 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3905 gen_lowpart (mode, a),
3906 gen_lowpart (mode, b)));
3907 return target;
3908 }
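/* For illustration (register numbers are arbitrary): with ELT_SIZE == 1
   and PERMUTE_SIZE == 4 the final instruction above is a predicate permute
   such as

       trn1    p0.s, p1.s, p2.s

   which puts the even-numbered .S groups of A into the even result
   positions and the even-numbered .S groups of B into the odd positions,
   reassembling the original constant from the two simpler halves.  */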
3909
3910 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3911 constant in BUILDER into an SVE predicate register. Return the register
3912 on success, otherwise return null. Use TARGET for the register if
3913 nonnull and convenient.
3914
3915 ALLOW_RECURSE_P is true if we can use methods that would call this
3916 function recursively. */
3917
3918 static rtx
3919 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3920 bool allow_recurse_p)
3921 {
3922 if (builder.encoded_nelts () == 1)
3923 /* A PFALSE or a PTRUE .B ALL. */
3924 return aarch64_emit_set_immediate (target, builder);
3925
3926 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3927 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3928 {
3929 /* If we can load the constant using PTRUE, use it as-is. */
3930 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3931 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3932 return aarch64_emit_set_immediate (target, builder);
3933
3934 /* Otherwise use WHILE to set the first VL bits. */
3935 return aarch64_sve_move_pred_via_while (target, mode, vl);
3936 }
3937
3938 if (!allow_recurse_p)
3939 return NULL_RTX;
3940
3941 /* Try inverting the vector in element size ELT_SIZE and then EORing
3942 the result with an ELT_SIZE PTRUE. */
3943 if (INTVAL (builder.elt (0)) == 0)
3944 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3945 elt_size))
3946 return res;
3947
3948 /* Try using TRN1 to permute two simpler constants. */
3949 for (unsigned int i = elt_size; i <= 8; i *= 2)
3950 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3951 elt_size, i))
3952 return res;
3953
3954 return NULL_RTX;
3955 }
3956
3957 /* Return an SVE predicate register that contains the VNx16BImode
3958 constant in BUILDER, without going through the move expanders.
3959
3960 The returned register can have whatever mode seems most natural
3961 given the contents of BUILDER. Use TARGET for the result if
3962 convenient. */
3963
3964 static rtx
3965 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3966 {
3967 /* Try loading the constant using pure predicate operations. */
3968 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3969 return res;
3970
3971 /* Try forcing the constant to memory. */
3972 if (builder.full_nelts ().is_constant ())
3973 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3974 {
3975 target = aarch64_target_reg (target, VNx16BImode);
3976 emit_move_insn (target, mem);
3977 return target;
3978 }
3979
3980 /* The last resort is to load the constant as an integer and then
3981 compare it against zero. Use -1 for set bits in order to increase
3982 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3983 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3984 builder.nelts_per_pattern ());
3985 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3986 int_builder.quick_push (INTVAL (builder.elt (i))
3987 ? constm1_rtx : const0_rtx);
3988 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3989 int_builder.build ());
3990 }
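/* For illustration (register numbers are arbitrary): the data-to-predicate
   fallback above typically ends up as a vector constant move followed by a
   predicated compare with zero, e.g.

       ptrue   p1.b
       cmpne   p0.b, p1/z, z0.b, #0

   Using -1 rather than 1 for the set bits makes it more likely that the
   vector constant can be built with SVE DUPM or an Advanced SIMD byte
   mask.  */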
3991
3992 /* Set DEST to immediate IMM. */
3993
3994 void
3995 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3996 {
3997 machine_mode mode = GET_MODE (dest);
3998
3999 /* Check on what type of symbol it is. */
4000 scalar_int_mode int_mode;
4001 if ((GET_CODE (imm) == SYMBOL_REF
4002 || GET_CODE (imm) == LABEL_REF
4003 || GET_CODE (imm) == CONST
4004 || GET_CODE (imm) == CONST_POLY_INT)
4005 && is_a <scalar_int_mode> (mode, &int_mode))
4006 {
4007 rtx mem;
4008 poly_int64 offset;
4009 HOST_WIDE_INT const_offset;
4010 enum aarch64_symbol_type sty;
4011
4012 /* If we have (const (plus symbol offset)), separate out the offset
4013 before we start classifying the symbol. */
4014 rtx base = strip_offset (imm, &offset);
4015
4016 /* We must always add an offset involving VL separately, rather than
4017 folding it into the relocation. */
4018 if (!offset.is_constant (&const_offset))
4019 {
4020 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4021 emit_insn (gen_rtx_SET (dest, imm));
4022 else
4023 {
4024 /* Do arithmetic on 32-bit values if the result is smaller
4025 than that. */
4026 if (partial_subreg_p (int_mode, SImode))
4027 {
4028 /* It is invalid to do symbol calculations in modes
4029 narrower than SImode. */
4030 gcc_assert (base == const0_rtx);
4031 dest = gen_lowpart (SImode, dest);
4032 int_mode = SImode;
4033 }
4034 if (base != const0_rtx)
4035 {
4036 base = aarch64_force_temporary (int_mode, dest, base);
4037 aarch64_add_offset (int_mode, dest, base, offset,
4038 NULL_RTX, NULL_RTX, false);
4039 }
4040 else
4041 aarch64_add_offset (int_mode, dest, base, offset,
4042 dest, NULL_RTX, false);
4043 }
4044 return;
4045 }
4046
4047 sty = aarch64_classify_symbol (base, const_offset);
4048 switch (sty)
4049 {
4050 case SYMBOL_FORCE_TO_MEM:
4051 if (const_offset != 0
4052 && targetm.cannot_force_const_mem (int_mode, imm))
4053 {
4054 gcc_assert (can_create_pseudo_p ());
4055 base = aarch64_force_temporary (int_mode, dest, base);
4056 aarch64_add_offset (int_mode, dest, base, const_offset,
4057 NULL_RTX, NULL_RTX, false);
4058 return;
4059 }
4060
4061 mem = force_const_mem (ptr_mode, imm);
4062 gcc_assert (mem);
4063
4064 /* If we aren't generating PC relative literals, then
4065 we need to expand the literal pool access carefully.
4066 This is something that needs to be done in a number
4067 of places, so could well live as a separate function. */
4068 if (!aarch64_pcrelative_literal_loads)
4069 {
4070 gcc_assert (can_create_pseudo_p ());
4071 base = gen_reg_rtx (ptr_mode);
4072 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4073 if (ptr_mode != Pmode)
4074 base = convert_memory_address (Pmode, base);
4075 mem = gen_rtx_MEM (ptr_mode, base);
4076 }
4077
4078 if (int_mode != ptr_mode)
4079 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4080
4081 emit_insn (gen_rtx_SET (dest, mem));
4082
4083 return;
4084
4085 case SYMBOL_SMALL_TLSGD:
4086 case SYMBOL_SMALL_TLSDESC:
4087 case SYMBOL_SMALL_TLSIE:
4088 case SYMBOL_SMALL_GOT_28K:
4089 case SYMBOL_SMALL_GOT_4G:
4090 case SYMBOL_TINY_GOT:
4091 case SYMBOL_TINY_TLSIE:
4092 if (const_offset != 0)
4093 {
4094 gcc_assert (can_create_pseudo_p ());
4095 base = aarch64_force_temporary (int_mode, dest, base);
4096 aarch64_add_offset (int_mode, dest, base, const_offset,
4097 NULL_RTX, NULL_RTX, false);
4098 return;
4099 }
4100 /* FALLTHRU */
4101
4102 case SYMBOL_SMALL_ABSOLUTE:
4103 case SYMBOL_TINY_ABSOLUTE:
4104 case SYMBOL_TLSLE12:
4105 case SYMBOL_TLSLE24:
4106 case SYMBOL_TLSLE32:
4107 case SYMBOL_TLSLE48:
4108 aarch64_load_symref_appropriately (dest, imm, sty);
4109 return;
4110
4111 default:
4112 gcc_unreachable ();
4113 }
4114 }
4115
4116 if (!CONST_INT_P (imm))
4117 {
4118 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4119 {
4120 /* Only the low bit of each .H, .S and .D element is defined,
4121 so we can set the upper bits to whatever we like. If the
4122 predicate is all-true in MODE, prefer to set all the undefined
4123 bits as well, so that we can share a single .B predicate for
4124 all modes. */
4125 if (imm == CONSTM1_RTX (mode))
4126 imm = CONSTM1_RTX (VNx16BImode);
4127
4128 /* All methods for constructing predicate modes wider than VNx16BI
4129 will set the upper bits of each element to zero. Expose this
4130 by moving such constants as a VNx16BI, so that all bits are
4131 significant and so that constants for different modes can be
4132 shared. The wider constant will still be available as a
4133 REG_EQUAL note. */
4134 rtx_vector_builder builder;
4135 if (aarch64_get_sve_pred_bits (builder, imm))
4136 {
4137 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4138 if (dest != res)
4139 emit_move_insn (dest, gen_lowpart (mode, res));
4140 return;
4141 }
4142 }
4143
4144 if (GET_CODE (imm) == HIGH
4145 || aarch64_simd_valid_immediate (imm, NULL))
4146 {
4147 emit_insn (gen_rtx_SET (dest, imm));
4148 return;
4149 }
4150
4151 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4152 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4153 {
4154 if (dest != res)
4155 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4156 return;
4157 }
4158
4159 rtx mem = force_const_mem (mode, imm);
4160 gcc_assert (mem);
4161 emit_move_insn (dest, mem);
4162 return;
4163 }
4164
4165 aarch64_internal_mov_immediate (dest, imm, true,
4166 as_a <scalar_int_mode> (mode));
4167 }
4168
4169 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4170 that is known to contain PTRUE. */
4171
4172 void
4173 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4174 {
4175 expand_operand ops[3];
4176 machine_mode mode = GET_MODE (dest);
4177 create_output_operand (&ops[0], dest, mode);
4178 create_input_operand (&ops[1], pred, GET_MODE (pred));
4179 create_input_operand (&ops[2], src, mode);
4180 temporary_volatile_ok v (true);
4181 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4182 }
4183
4184 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4185 operand is in memory. In this case we need to use the predicated LD1
4186 and ST1 instead of LDR and STR, both for correctness on big-endian
4187 targets and because LD1 and ST1 support a wider range of addressing modes.
4188 PRED_MODE is the mode of the predicate.
4189
4190 See the comment at the head of aarch64-sve.md for details about the
4191 big-endian handling. */
4192
4193 void
4194 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4195 {
4196 machine_mode mode = GET_MODE (dest);
4197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4198 if (!register_operand (src, mode)
4199 && !register_operand (dest, mode))
4200 {
4201 rtx tmp = gen_reg_rtx (mode);
4202 if (MEM_P (src))
4203 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4204 else
4205 emit_move_insn (tmp, src);
4206 src = tmp;
4207 }
4208 aarch64_emit_sve_pred_move (dest, ptrue, src);
4209 }
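/* For illustration (register numbers are arbitrary): a memory-to-memory
   SVE copy of .B data expands under the function above to roughly

       ptrue   p0.b
       ld1b    z0.b, p0/z, [x0]
       st1b    z0.b, p0, [x1]

   rather than an LDR/STR pair.  */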
4210
4211 /* Called only on big-endian targets. See whether an SVE vector move
4212 from SRC to DEST is effectively a REV[BHW] instruction, because at
4213 least one operand is a subreg of an SVE vector that has wider or
4214 narrower elements. Return true and emit the instruction if so.
4215
4216 For example:
4217
4218 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4219
4220 represents a VIEW_CONVERT between the following vectors, viewed
4221 in memory order:
4222
4223 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4224 R1: { [0], [1], [2], [3], ... }
4225
4226 The high part of lane X in R2 should therefore correspond to lane X*2
4227 of R1, but the register representations are:
4228
4229 msb lsb
4230 R2: ...... [1].high [1].low [0].high [0].low
4231 R1: ...... [3] [2] [1] [0]
4232
4233 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4234 We therefore need a reverse operation to swap the high and low values
4235 around.
4236
4237 This is purely an optimization. Without it we would spill the
4238 subreg operand to the stack in one mode and reload it in the
4239 other mode, which has the same effect as the REV. */
4240
4241 bool
4242 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4243 {
4244 gcc_assert (BYTES_BIG_ENDIAN);
4245 if (GET_CODE (dest) == SUBREG)
4246 dest = SUBREG_REG (dest);
4247 if (GET_CODE (src) == SUBREG)
4248 src = SUBREG_REG (src);
4249
4250 /* The optimization handles two single SVE REGs with different element
4251 sizes. */
4252 if (!REG_P (dest)
4253 || !REG_P (src)
4254 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4255 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4256 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4257 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4258 return false;
4259
4260 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4261 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4262 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4263 UNSPEC_REV_SUBREG);
4264 emit_insn (gen_rtx_SET (dest, unspec));
4265 return true;
4266 }
4267
4268 /* Return a copy of X with mode MODE, without changing its other
4269 attributes. Unlike gen_lowpart, this doesn't care whether the
4270 mode change is valid. */
4271
4272 static rtx
4273 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4274 {
4275 if (GET_MODE (x) == mode)
4276 return x;
4277
4278 x = shallow_copy_rtx (x);
4279 set_mode_and_regno (x, mode, REGNO (x));
4280 return x;
4281 }
4282
4283 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4284 operands. */
4285
4286 void
4287 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4288 {
4289 /* Decide which REV operation we need. The mode with narrower elements
4290 determines the mode of the operands and the mode with the wider
4291 elements determines the reverse width. */
4292 machine_mode mode_with_wider_elts = GET_MODE (dest);
4293 machine_mode mode_with_narrower_elts = GET_MODE (src);
4294 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4295 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4296 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4297
4298 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4299 unsigned int unspec;
4300 if (wider_bytes == 8)
4301 unspec = UNSPEC_REV64;
4302 else if (wider_bytes == 4)
4303 unspec = UNSPEC_REV32;
4304 else if (wider_bytes == 2)
4305 unspec = UNSPEC_REV16;
4306 else
4307 gcc_unreachable ();
4308 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4309
4310 /* Emit:
4311
4312 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] UNSPEC_PRED_X))
4313
4314 with the appropriate modes. */
4315 ptrue = gen_lowpart (pred_mode, ptrue);
4316 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4317 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4318 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4319 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4320 UNSPEC_PRED_X);
4321 emit_insn (gen_rtx_SET (dest, src));
4322 }
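/* For illustration (register numbers are arbitrary): for a subreg between
   VNx8HI and VNx16QI the wider element size is 2 bytes, so the split above
   emits a byte reversal within each halfword, e.g.

       revb    z0.h, p0/m, z1.h  */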
4323
4324 static bool
4325 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4326 tree exp ATTRIBUTE_UNUSED)
4327 {
4328 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4329 return false;
4330
4331 return true;
4332 }
4333
4334 /* Implement TARGET_PASS_BY_REFERENCE. */
4335
4336 static bool
4337 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4338 machine_mode mode,
4339 const_tree type,
4340 bool named ATTRIBUTE_UNUSED)
4341 {
4342 HOST_WIDE_INT size;
4343 machine_mode dummymode;
4344 int nregs;
4345
4346 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4347 if (mode == BLKmode && type)
4348 size = int_size_in_bytes (type);
4349 else
4350 /* No frontends can create types with variable-sized modes, so we
4351 shouldn't be asked to pass or return them. */
4352 size = GET_MODE_SIZE (mode).to_constant ();
4353
4354 /* Aggregates are passed by reference based on their size. */
4355 if (type && AGGREGATE_TYPE_P (type))
4356 {
4357 size = int_size_in_bytes (type);
4358 }
4359
4360 /* Variable sized arguments are always returned by reference. */
4361 if (size < 0)
4362 return true;
4363
4364 /* Can this be a candidate to be passed in fp/simd register(s)? */
4365 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4366 &dummymode, &nregs,
4367 NULL))
4368 return false;
4369
4370 /* Arguments which are variable sized or larger than 2 registers are
4371 passed by reference unless they are homogeneous floating-point
4372 aggregates. */
4373 return size > 2 * UNITS_PER_WORD;
4374 }
4375
4376 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4377 static bool
4378 aarch64_return_in_msb (const_tree valtype)
4379 {
4380 machine_mode dummy_mode;
4381 int dummy_int;
4382
4383 /* Never happens in little-endian mode. */
4384 if (!BYTES_BIG_ENDIAN)
4385 return false;
4386
4387 /* Only composite types smaller than or equal to 16 bytes can
4388 be potentially returned in registers. */
4389 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4390 || int_size_in_bytes (valtype) <= 0
4391 || int_size_in_bytes (valtype) > 16)
4392 return false;
4393
4394 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4395 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4396 is always passed/returned in the least significant bits of fp/simd
4397 register(s). */
4398 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4399 &dummy_mode, &dummy_int, NULL))
4400 return false;
4401
4402 return true;
4403 }
4404
4405 /* Implement TARGET_FUNCTION_VALUE.
4406 Define how to find the value returned by a function. */
4407
4408 static rtx
4409 aarch64_function_value (const_tree type, const_tree func,
4410 bool outgoing ATTRIBUTE_UNUSED)
4411 {
4412 machine_mode mode;
4413 int unsignedp;
4414 int count;
4415 machine_mode ag_mode;
4416
4417 mode = TYPE_MODE (type);
4418 if (INTEGRAL_TYPE_P (type))
4419 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4420
4421 if (aarch64_return_in_msb (type))
4422 {
4423 HOST_WIDE_INT size = int_size_in_bytes (type);
4424
4425 if (size % UNITS_PER_WORD != 0)
4426 {
4427 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4428 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4429 }
4430 }
4431
4432 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4433 &ag_mode, &count, NULL))
4434 {
4435 if (!aarch64_composite_type_p (type, mode))
4436 {
4437 gcc_assert (count == 1 && mode == ag_mode);
4438 return gen_rtx_REG (mode, V0_REGNUM);
4439 }
4440 else
4441 {
4442 int i;
4443 rtx par;
4444
4445 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4446 for (i = 0; i < count; i++)
4447 {
4448 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4449 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4450 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4451 XVECEXP (par, 0, i) = tmp;
4452 }
4453 return par;
4454 }
4455 }
4456 else
4457 return gen_rtx_REG (mode, R0_REGNUM);
4458 }
4459
4460 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4461 Return true if REGNO is the number of a hard register in which the values
4462 of called function may come back. */
4463
4464 static bool
4465 aarch64_function_value_regno_p (const unsigned int regno)
4466 {
4467 /* A maximum of 16 bytes can be returned in the general registers. Examples
4468 of 16-byte return values are: 128-bit integers and 16-byte small
4469 structures (excluding homogeneous floating-point aggregates). */
4470 if (regno == R0_REGNUM || regno == R1_REGNUM)
4471 return true;
4472
4473 /* Up to four fp/simd registers can return a function value, e.g. a
4474 homogeneous floating-point aggregate having four members. */
4475 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4476 return TARGET_FLOAT;
4477
4478 return false;
4479 }
4480
4481 /* Implement TARGET_RETURN_IN_MEMORY.
4482
4483 If the type T of the result of a function is such that
4484 void func (T arg)
4485 would require that arg be passed as a value in a register (or set of
4486 registers) according to the parameter passing rules, then the result
4487 is returned in the same registers as would be used for such an
4488 argument. */
4489
4490 static bool
4491 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4492 {
4493 HOST_WIDE_INT size;
4494 machine_mode ag_mode;
4495 int count;
4496
4497 if (!AGGREGATE_TYPE_P (type)
4498 && TREE_CODE (type) != COMPLEX_TYPE
4499 && TREE_CODE (type) != VECTOR_TYPE)
4500 /* Simple scalar types are always returned in registers. */
4501 return false;
4502
4503 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4504 type,
4505 &ag_mode,
4506 &count,
4507 NULL))
4508 return false;
4509
4510 /* Types larger than 2 registers are returned in memory. */
4511 size = int_size_in_bytes (type);
4512 return (size < 0 || size > 2 * UNITS_PER_WORD);
4513 }
4514
4515 static bool
4516 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4517 const_tree type, int *nregs)
4518 {
4519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4520 return aarch64_vfp_is_call_or_return_candidate (mode,
4521 type,
4522 &pcum->aapcs_vfp_rmode,
4523 nregs,
4524 NULL);
4525 }
4526
4527 /* Given MODE and TYPE of a function argument, return the alignment in
4528 bits. The idea is to suppress any stronger alignment requested by
4529 the user and opt for the natural alignment (specified in AAPCS64 \S
4530 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4531 calculated in versions of GCC prior to GCC-9. This is a helper
4532 function for local use only. */
4533
4534 static unsigned int
4535 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4536 bool *abi_break)
4537 {
4538 *abi_break = false;
4539 if (!type)
4540 return GET_MODE_ALIGNMENT (mode);
4541
4542 if (integer_zerop (TYPE_SIZE (type)))
4543 return 0;
4544
4545 gcc_assert (TYPE_MODE (type) == mode);
4546
4547 if (!AGGREGATE_TYPE_P (type))
4548 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4549
4550 if (TREE_CODE (type) == ARRAY_TYPE)
4551 return TYPE_ALIGN (TREE_TYPE (type));
4552
4553 unsigned int alignment = 0;
4554 unsigned int bitfield_alignment = 0;
4555 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4556 if (TREE_CODE (field) == FIELD_DECL)
4557 {
4558 alignment = std::max (alignment, DECL_ALIGN (field));
4559 if (DECL_BIT_FIELD_TYPE (field))
4560 bitfield_alignment
4561 = std::max (bitfield_alignment,
4562 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4563 }
4564
4565 if (bitfield_alignment > alignment)
4566 {
4567 *abi_break = true;
4568 return bitfield_alignment;
4569 }
4570
4571 return alignment;
4572 }
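/* For illustration: the ABI_BREAK output above flags the case where the
   returned alignment comes only from the declared type of a bit-field
   (for example a 16-byte-aligned bit-field type inside an otherwise
   byte-aligned struct).  GCC releases before 9.1 ignored such bit-field
   types here, so callers use the flag to emit a -Wpsabi note whenever the
   larger value would change how the argument is passed.  */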
4573
4574 /* Layout a function argument according to the AAPCS64 rules. The rule
4575 numbers refer to the rule numbers in the AAPCS64. */
4576
4577 static void
4578 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4579 const_tree type,
4580 bool named ATTRIBUTE_UNUSED)
4581 {
4582 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4583 int ncrn, nvrn, nregs;
4584 bool allocate_ncrn, allocate_nvrn;
4585 HOST_WIDE_INT size;
4586 bool abi_break;
4587
4588 /* We need to do this once per argument. */
4589 if (pcum->aapcs_arg_processed)
4590 return;
4591
4592 pcum->aapcs_arg_processed = true;
4593
4594 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4595 if (type)
4596 size = int_size_in_bytes (type);
4597 else
4598 /* No frontends can create types with variable-sized modes, so we
4599 shouldn't be asked to pass or return them. */
4600 size = GET_MODE_SIZE (mode).to_constant ();
4601 size = ROUND_UP (size, UNITS_PER_WORD);
4602
4603 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4604 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4605 mode,
4606 type,
4607 &nregs);
4608
4609 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4610 The following code thus handles passing by SIMD/FP registers first. */
4611
4612 nvrn = pcum->aapcs_nvrn;
4613
4614 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4615 and homogeneous short-vector aggregates (HVA). */
4616 if (allocate_nvrn)
4617 {
4618 if (!TARGET_FLOAT)
4619 aarch64_err_no_fpadvsimd (mode);
4620
4621 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4622 {
4623 pcum->aapcs_nextnvrn = nvrn + nregs;
4624 if (!aarch64_composite_type_p (type, mode))
4625 {
4626 gcc_assert (nregs == 1);
4627 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4628 }
4629 else
4630 {
4631 rtx par;
4632 int i;
4633 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4634 for (i = 0; i < nregs; i++)
4635 {
4636 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4637 V0_REGNUM + nvrn + i);
4638 rtx offset = gen_int_mode
4639 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4640 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4641 XVECEXP (par, 0, i) = tmp;
4642 }
4643 pcum->aapcs_reg = par;
4644 }
4645 return;
4646 }
4647 else
4648 {
4649 /* C.3 NSRN is set to 8. */
4650 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4651 goto on_stack;
4652 }
4653 }
4654
4655 ncrn = pcum->aapcs_ncrn;
4656 nregs = size / UNITS_PER_WORD;
4657
4658 /* C6 - C9, though the sign and zero extension semantics are
4659 handled elsewhere. This is the case where the argument fits
4660 entirely in general registers. */
4661 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4662 {
4663 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4664
4665 /* C.8 if the argument has an alignment of 16 then the NGRN is
4666 rounded up to the next even number. */
4667 if (nregs == 2
4668 && ncrn % 2
4669 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4670 comparison is there because for > 16 * BITS_PER_UNIT
4671 alignment nregs should be > 2 and therefore it should be
4672 passed by reference rather than value. */
4673 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4674 == 16 * BITS_PER_UNIT))
4675 {
4676 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4677 inform (input_location, "parameter passing for argument of type "
4678 "%qT changed in GCC 9.1", type);
4679 ++ncrn;
4680 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4681 }
4682
4683 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4684 A reg is still generated for it, but the caller should be smart
4685 enough not to use it. */
4686 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4687 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4688 else
4689 {
4690 rtx par;
4691 int i;
4692
4693 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4694 for (i = 0; i < nregs; i++)
4695 {
4696 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4697 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4698 GEN_INT (i * UNITS_PER_WORD));
4699 XVECEXP (par, 0, i) = tmp;
4700 }
4701 pcum->aapcs_reg = par;
4702 }
4703
4704 pcum->aapcs_nextncrn = ncrn + nregs;
4705 return;
4706 }
4707
4708 /* C.11 */
4709 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4710
4711 /* The argument is passed on stack; record the needed number of words for
4712 this argument and align the total size if necessary. */
4713 on_stack:
4714 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4715
4716 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4717 == 16 * BITS_PER_UNIT)
4718 {
4719 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4720 if (pcum->aapcs_stack_size != new_size)
4721 {
4722 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4723 inform (input_location, "parameter passing for argument of type "
4724 "%qT changed in GCC 9.1", type);
4725 pcum->aapcs_stack_size = new_size;
4726 }
4727 }
4728 return;
4729 }
4730
4731 /* Implement TARGET_FUNCTION_ARG. */
4732
4733 static rtx
4734 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4735 const_tree type, bool named)
4736 {
4737 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4738 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4739
4740 if (mode == VOIDmode)
4741 return NULL_RTX;
4742
4743 aarch64_layout_arg (pcum_v, mode, type, named);
4744 return pcum->aapcs_reg;
4745 }
4746
4747 void
4748 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4749 const_tree fntype ATTRIBUTE_UNUSED,
4750 rtx libname ATTRIBUTE_UNUSED,
4751 const_tree fndecl ATTRIBUTE_UNUSED,
4752 unsigned n_named ATTRIBUTE_UNUSED)
4753 {
4754 pcum->aapcs_ncrn = 0;
4755 pcum->aapcs_nvrn = 0;
4756 pcum->aapcs_nextncrn = 0;
4757 pcum->aapcs_nextnvrn = 0;
4758 pcum->pcs_variant = ARM_PCS_AAPCS64;
4759 pcum->aapcs_reg = NULL_RTX;
4760 pcum->aapcs_arg_processed = false;
4761 pcum->aapcs_stack_words = 0;
4762 pcum->aapcs_stack_size = 0;
4763
4764 if (!TARGET_FLOAT
4765 && fndecl && TREE_PUBLIC (fndecl)
4766 && fntype && fntype != error_mark_node)
4767 {
4768 const_tree type = TREE_TYPE (fntype);
4769 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4770 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4771 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4772 &mode, &nregs, NULL))
4773 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4774 }
4775 return;
4776 }
4777
4778 static void
4779 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4780 machine_mode mode,
4781 const_tree type,
4782 bool named)
4783 {
4784 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4785 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4786 {
4787 aarch64_layout_arg (pcum_v, mode, type, named);
4788 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4789 != (pcum->aapcs_stack_words != 0));
4790 pcum->aapcs_arg_processed = false;
4791 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4792 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4793 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4794 pcum->aapcs_stack_words = 0;
4795 pcum->aapcs_reg = NULL_RTX;
4796 }
4797 }
4798
4799 bool
4800 aarch64_function_arg_regno_p (unsigned regno)
4801 {
4802 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4803 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4804 }
4805
4806 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4807 PARM_BOUNDARY bits of alignment, but will be given anything up
4808 to STACK_BOUNDARY bits if the type requires it. This makes sure
4809 that both before and after the layout of each argument, the Next
4810 Stacked Argument Address (NSAA) will have a minimum alignment of
4811 8 bytes. */
4812
4813 static unsigned int
4814 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4815 {
4816 bool abi_break;
4817 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4818 &abi_break);
4819 if (abi_break && warn_psabi)
4820 inform (input_location, "parameter passing for argument of type "
4821 "%qT changed in GCC 9.1", type);
4822
4823 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4824 }
4825
4826 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4827
4828 static fixed_size_mode
4829 aarch64_get_reg_raw_mode (int regno)
4830 {
4831 if (TARGET_SVE && FP_REGNUM_P (regno))
4832 /* Don't use the SVE part of the register for __builtin_apply and
4833 __builtin_return. The SVE registers aren't used by the normal PCS,
4834 so using them there would be a waste of time. The PCS extensions
4835 for SVE types are fundamentally incompatible with the
4836 __builtin_return/__builtin_apply interface. */
4837 return as_a <fixed_size_mode> (V16QImode);
4838 return default_get_reg_raw_mode (regno);
4839 }
4840
4841 /* Implement TARGET_FUNCTION_ARG_PADDING.
4842
4843 Small aggregate types are placed in the lowest memory address.
4844
4845 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4846
4847 static pad_direction
4848 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4849 {
4850 /* On little-endian targets, the least significant byte of every stack
4851 argument is passed at the lowest byte address of the stack slot. */
4852 if (!BYTES_BIG_ENDIAN)
4853 return PAD_UPWARD;
4854
4855 /* Otherwise, integral, floating-point and pointer types are padded downward:
4856 the least significant byte of a stack argument is passed at the highest
4857 byte address of the stack slot. */
4858 if (type
4859 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4860 || POINTER_TYPE_P (type))
4861 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4862 return PAD_DOWNWARD;
4863
4864 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4865 return PAD_UPWARD;
4866 }
4867
4868 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4869
4870 It specifies padding for the last (possibly the only)
4871 element of a block move between registers and memory. Assuming
4872 the block is in memory, padding upward means that the last
4873 element is padded after its most significant byte, while with
4874 downward padding the last element is padded on its least
4875 significant byte side.
4876
4877 Small aggregates and small complex types are always padded
4878 upwards.
4879
4880 We don't need to worry about homogeneous floating-point or
4881 short-vector aggregates; their move is not affected by the
4882 padding direction determined here. Regardless of endianness,
4883 each element of such an aggregate is put in the least
4884 significant bits of a fp/simd register.
4885
4886 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4887 register has useful data, and return the opposite if the most
4888 significant byte does. */
4889
4890 bool
4891 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4892 bool first ATTRIBUTE_UNUSED)
4893 {
4894
4895 /* Small composite types are always padded upward. */
4896 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4897 {
4898 HOST_WIDE_INT size;
4899 if (type)
4900 size = int_size_in_bytes (type);
4901 else
4902 /* No frontends can create types with variable-sized modes, so we
4903 shouldn't be asked to pass or return them. */
4904 size = GET_MODE_SIZE (mode).to_constant ();
4905 if (size < 2 * UNITS_PER_WORD)
4906 return true;
4907 }
4908
4909 /* Otherwise, use the default padding. */
4910 return !BYTES_BIG_ENDIAN;
4911 }
4912
4913 static scalar_int_mode
4914 aarch64_libgcc_cmp_return_mode (void)
4915 {
4916 return SImode;
4917 }
4918
4919 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4920
4921 /* We use the 12-bit shifted immediate arithmetic instructions so values
4922 must be multiple of (1 << 12), i.e. 4096. */
4923 #define ARITH_FACTOR 4096
4924
4925 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4926 #error Cannot use simple address calculation for stack probing
4927 #endif
4928
4929 /* The pair of scratch registers used for stack probing. */
4930 #define PROBE_STACK_FIRST_REG R9_REGNUM
4931 #define PROBE_STACK_SECOND_REG R10_REGNUM
4932
4933 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4934 inclusive. These are offsets from the current stack pointer. */
4935
4936 static void
4937 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4938 {
4939 HOST_WIDE_INT size;
4940 if (!poly_size.is_constant (&size))
4941 {
4942 sorry ("stack probes for SVE frames");
4943 return;
4944 }
4945
4946 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4947
4948 /* See the same assertion on PROBE_INTERVAL above. */
4949 gcc_assert ((first % ARITH_FACTOR) == 0);
4950
4951 /* See if we have a constant small number of probes to generate. If so,
4952 that's the easy case. */
4953 if (size <= PROBE_INTERVAL)
4954 {
4955 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4956
4957 emit_set_insn (reg1,
4958 plus_constant (Pmode,
4959 stack_pointer_rtx, -(first + base)));
4960 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4961 }
4962
4963 /* The run-time loop is made up of 8 insns in the generic case while the
4964 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4965 else if (size <= 4 * PROBE_INTERVAL)
4966 {
4967 HOST_WIDE_INT i, rem;
4968
4969 emit_set_insn (reg1,
4970 plus_constant (Pmode,
4971 stack_pointer_rtx,
4972 -(first + PROBE_INTERVAL)));
4973 emit_stack_probe (reg1);
4974
4975 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4976 it exceeds SIZE. If only two probes are needed, this will not
4977 generate any code. Then probe at FIRST + SIZE. */
4978 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4979 {
4980 emit_set_insn (reg1,
4981 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4982 emit_stack_probe (reg1);
4983 }
4984
4985 rem = size - (i - PROBE_INTERVAL);
4986 if (rem > 256)
4987 {
4988 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4989
4990 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4991 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4992 }
4993 else
4994 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4995 }
4996
4997 /* Otherwise, do the same as above, but in a loop. Note that we must be
4998 extra careful with variables wrapping around because we might be at
4999 the very top (or the very bottom) of the address space and we have
5000 to be able to handle this case properly; in particular, we use an
5001 equality test for the loop condition. */
5002 else
5003 {
5004 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5005
5006 /* Step 1: round SIZE to the previous multiple of the interval. */
5007
5008 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5009
5010
5011 /* Step 2: compute initial and final value of the loop counter. */
5012
5013 /* TEST_ADDR = SP + FIRST. */
5014 emit_set_insn (reg1,
5015 plus_constant (Pmode, stack_pointer_rtx, -first));
5016
5017 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5018 HOST_WIDE_INT adjustment = - (first + rounded_size);
5019 if (! aarch64_uimm12_shift (adjustment))
5020 {
5021 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5022 true, Pmode);
5023 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5024 }
5025 else
5026 emit_set_insn (reg2,
5027 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5028
5029 /* Step 3: the loop
5030
5031 do
5032 {
5033 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5034 probe at TEST_ADDR
5035 }
5036 while (TEST_ADDR != LAST_ADDR)
5037
5038 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5039 until it is equal to ROUNDED_SIZE. */
5040
5041 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5042
5043
5044 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5045 that SIZE is equal to ROUNDED_SIZE. */
5046
5047 if (size != rounded_size)
5048 {
5049 HOST_WIDE_INT rem = size - rounded_size;
5050
5051 if (rem > 256)
5052 {
5053 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5054
5055 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5056 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5057 }
5058 else
5059 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5060 }
5061 }
5062
5063 /* Make sure nothing is scheduled before we are done. */
5064 emit_insn (gen_blockage ());
5065 }
5066
5067 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5068 absolute addresses. */
5069
5070 const char *
5071 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5072 {
5073 static int labelno = 0;
5074 char loop_lab[32];
5075 rtx xops[2];
5076
5077 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5078
5079 /* Loop. */
5080 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5081
5082 HOST_WIDE_INT stack_clash_probe_interval
5083 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5084
5085 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5086 xops[0] = reg1;
5087 HOST_WIDE_INT interval;
5088 if (flag_stack_clash_protection)
5089 interval = stack_clash_probe_interval;
5090 else
5091 interval = PROBE_INTERVAL;
5092
5093 gcc_assert (aarch64_uimm12_shift (interval));
5094 xops[1] = GEN_INT (interval);
5095
5096 output_asm_insn ("sub\t%0, %0, %1", xops);
5097
5098 /* If doing stack clash protection then we probe up by the ABI specified
5099 amount. We do this because we're dropping full pages at a time in the
5100 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5101 if (flag_stack_clash_protection)
5102 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5103 else
5104 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5105
5106 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5107 by this amount for each iteration. */
5108 output_asm_insn ("str\txzr, [%0, %1]", xops);
5109
5110 /* Test if TEST_ADDR == LAST_ADDR. */
5111 xops[1] = reg2;
5112 output_asm_insn ("cmp\t%0, %1", xops);
5113
5114 /* Branch. */
5115 fputs ("\tb.ne\t", asm_out_file);
5116 assemble_name_raw (asm_out_file, loop_lab);
5117 fputc ('\n', asm_out_file);
5118
5119 return "";
5120 }
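/* For illustration, the loop printed above looks roughly like this for the
   default 4 KiB probe interval (registers and label are arbitrary):

   .LPSRL0:
       sub     x9, x9, #4096
       str     xzr, [x9, 0]      // offset is STACK_CLASH_CALLER_GUARD
                                 // when stack clash protection is enabled
       cmp     x9, x10
       b.ne    .LPSRL0  */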
5121
5122 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5123 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5124 of GUARD_SIZE. When a probe is emitted it is done at most
5125 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5126 at most MIN_PROBE_THRESHOLD. By the end of this function
5127 BASE = BASE - ADJUSTMENT. */
5128
5129 const char *
5130 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5131 rtx min_probe_threshold, rtx guard_size)
5132 {
5133 /* This function is not allowed to use any instruction generation function
5134 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5135 so instead emit the code you want using output_asm_insn. */
5136 gcc_assert (flag_stack_clash_protection);
5137 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5138 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5139
5140 /* The minimum required allocation before the residual requires probing. */
5141 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5142
5143 /* Clamp the value down to the nearest value that can be used with a cmp. */
5144 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5145 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5146
5147 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5148 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5149
5150 static int labelno = 0;
5151 char loop_start_lab[32];
5152 char loop_end_lab[32];
5153 rtx xops[2];
5154
5155 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5156 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5157
5158 /* Emit loop start label. */
5159 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5160
5161 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5162 xops[0] = adjustment;
5163 xops[1] = probe_offset_value_rtx;
5164 output_asm_insn ("cmp\t%0, %1", xops);
5165
5166 /* Branch to end if not enough adjustment to probe. */
5167 fputs ("\tb.lt\t", asm_out_file);
5168 assemble_name_raw (asm_out_file, loop_end_lab);
5169 fputc ('\n', asm_out_file);
5170
5171 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5172 xops[0] = base;
5173 xops[1] = probe_offset_value_rtx;
5174 output_asm_insn ("sub\t%0, %0, %1", xops);
5175
5176 /* Probe at BASE. */
5177 xops[1] = const0_rtx;
5178 output_asm_insn ("str\txzr, [%0, %1]", xops);
5179
5180 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5181 xops[0] = adjustment;
5182 xops[1] = probe_offset_value_rtx;
5183 output_asm_insn ("sub\t%0, %0, %1", xops);
5184
5185 /* Branch to start if still more bytes to allocate. */
5186 fputs ("\tb\t", asm_out_file);
5187 assemble_name_raw (asm_out_file, loop_start_lab);
5188 fputc ('\n', asm_out_file);
5189
5190 /* Remaining adjustment is below the probe threshold; leave the loop. */
5191 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5192
5193 /* BASE = BASE - ADJUSTMENT. */
5194 xops[0] = base;
5195 xops[1] = adjustment;
5196 output_asm_insn ("sub\t%0, %0, %1", xops);
5197 return "";
5198 }
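/* For illustration, the output above is roughly as follows, where #<guard>
   stands for the clamped RESIDUAL_PROBE_GUARD and the registers and labels
   are arbitrary:

   .SVLPSPL0:
       cmp     x11, #<guard>
       b.lt    .SVLPEND0
       sub     x10, x10, #<guard>
       str     xzr, [x10, 0]
       sub     x11, x11, #<guard>
       b       .SVLPSPL0
   .SVLPEND0:
       sub     x10, x10, x11

   i.e. drop and probe one guard-sized block at a time, then make the final
   sub-guard adjustment without a probe.  */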
5199
5200 /* Determine whether a frame chain needs to be generated. */
5201 static bool
5202 aarch64_needs_frame_chain (void)
5203 {
5204 /* Force a frame chain for EH returns so the return address is at FP+8. */
5205 if (frame_pointer_needed || crtl->calls_eh_return)
5206 return true;
5207
5208 /* A leaf function cannot have calls or write LR. */
5209 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5210
5211 /* Don't use a frame chain in leaf functions if leaf frame pointers
5212 are disabled. */
5213 if (flag_omit_leaf_frame_pointer && is_leaf)
5214 return false;
5215
5216 return aarch64_use_frame_pointer;
5217 }
5218
5219 /* Mark the registers that need to be saved by the callee and calculate
5220 the size of the callee-saved registers area and frame record (both FP
5221 and LR may be omitted). */
5222 static void
5223 aarch64_layout_frame (void)
5224 {
5225 HOST_WIDE_INT offset = 0;
5226 int regno, last_fp_reg = INVALID_REGNUM;
5227 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5228
5229 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5230
5231 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5232 the mid-end is doing. */
5233 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5234
5235 #define SLOT_NOT_REQUIRED (-2)
5236 #define SLOT_REQUIRED (-1)
5237
5238 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5239 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5240
5241 /* If this is a non-leaf simd function with calls we assume that
5242 at least one of those calls is to a non-simd function and thus
5243 we must save V8 to V23 in the prologue. */
5244
5245 if (simd_function && !crtl->is_leaf)
5246 {
5247 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5248 if (FP_SIMD_SAVED_REGNUM_P (regno))
5249 df_set_regs_ever_live (regno, true);
5250 }
5251
5252 /* First mark all the registers that really need to be saved... */
5253 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5254 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5255
5256 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5257 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5258
5259 /* ... that includes the eh data registers (if needed)... */
5260 if (crtl->calls_eh_return)
5261 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5262 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5263 = SLOT_REQUIRED;
5264
5265 /* ... and any callee saved register that dataflow says is live. */
5266 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5267 if (df_regs_ever_live_p (regno)
5268 && (regno == R30_REGNUM
5269 || !call_used_regs[regno]))
5270 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5271
5272 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5273 if (df_regs_ever_live_p (regno)
5274 && (!call_used_regs[regno]
5275 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5276 {
5277 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5278 last_fp_reg = regno;
5279 }
5280
5281 if (cfun->machine->frame.emit_frame_chain)
5282 {
5283 /* FP and LR are placed in the linkage record. */
5284 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5285 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5286 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5287 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5288 offset = 2 * UNITS_PER_WORD;
5289 }
5290
5291 /* With stack-clash, LR must be saved in non-leaf functions. */
5292 gcc_assert (crtl->is_leaf
5293 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5294 != SLOT_NOT_REQUIRED));
5295
5296 /* Now assign stack slots for them. */
5297 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5298 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5299 {
5300 cfun->machine->frame.reg_offset[regno] = offset;
5301 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5302 cfun->machine->frame.wb_candidate1 = regno;
5303 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5304 cfun->machine->frame.wb_candidate2 = regno;
5305 offset += UNITS_PER_WORD;
5306 }
5307
5308 HOST_WIDE_INT max_int_offset = offset;
5309 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5310 bool has_align_gap = offset != max_int_offset;
5311
5312 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5313 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5314 {
5315 /* If there is an alignment gap between integer and fp callee-saves,
5316 allocate the last fp register to it if possible. */
5317 if (regno == last_fp_reg
5318 && has_align_gap
5319 && !simd_function
5320 && (offset & 8) == 0)
5321 {
5322 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5323 break;
5324 }
5325
5326 cfun->machine->frame.reg_offset[regno] = offset;
5327 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5328 cfun->machine->frame.wb_candidate1 = regno;
5329 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5330 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5331 cfun->machine->frame.wb_candidate2 = regno;
5332 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5333 }
5334
5335 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5336
5337 cfun->machine->frame.saved_regs_size = offset;
5338
5339 HOST_WIDE_INT varargs_and_saved_regs_size
5340 = offset + cfun->machine->frame.saved_varargs_size;
5341
5342 cfun->machine->frame.hard_fp_offset
5343 = aligned_upper_bound (varargs_and_saved_regs_size
5344 + get_frame_size (),
5345 STACK_BOUNDARY / BITS_PER_UNIT);
5346
5347 /* Both these values are already aligned. */
5348 gcc_assert (multiple_p (crtl->outgoing_args_size,
5349 STACK_BOUNDARY / BITS_PER_UNIT));
5350 cfun->machine->frame.frame_size
5351 = (cfun->machine->frame.hard_fp_offset
5352 + crtl->outgoing_args_size);
5353
5354 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5355
5356 cfun->machine->frame.initial_adjust = 0;
5357 cfun->machine->frame.final_adjust = 0;
5358 cfun->machine->frame.callee_adjust = 0;
5359 cfun->machine->frame.callee_offset = 0;
5360
5361 HOST_WIDE_INT max_push_offset = 0;
5362 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5363 max_push_offset = 512;
5364 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5365 max_push_offset = 256;
5366
5367 HOST_WIDE_INT const_size, const_fp_offset;
5368 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5369 && const_size < max_push_offset
5370 && known_eq (crtl->outgoing_args_size, 0))
5371 {
5372 /* Simple, small frame with no outgoing arguments:
5373 stp reg1, reg2, [sp, -frame_size]!
5374 stp reg3, reg4, [sp, 16] */
5375 cfun->machine->frame.callee_adjust = const_size;
5376 }
5377 else if (known_lt (crtl->outgoing_args_size
5378 + cfun->machine->frame.saved_regs_size, 512)
5379 && !(cfun->calls_alloca
5380 && known_lt (cfun->machine->frame.hard_fp_offset,
5381 max_push_offset)))
5382 {
5383 /* Frame with small outgoing arguments:
5384 sub sp, sp, frame_size
5385 stp reg1, reg2, [sp, outgoing_args_size]
5386 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5387 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5388 cfun->machine->frame.callee_offset
5389 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5390 }
5391 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5392 && const_fp_offset < max_push_offset)
5393 {
5394 /* Frame with large outgoing arguments but a small local area:
5395 stp reg1, reg2, [sp, -hard_fp_offset]!
5396 stp reg3, reg4, [sp, 16]
5397 sub sp, sp, outgoing_args_size */
5398 cfun->machine->frame.callee_adjust = const_fp_offset;
5399 cfun->machine->frame.final_adjust
5400 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5401 }
5402 else
5403 {
5404 /* Frame with large local area and outgoing arguments using frame pointer:
5405 sub sp, sp, hard_fp_offset
5406 stp x29, x30, [sp, 0]
5407 add x29, sp, 0
5408 stp reg3, reg4, [sp, 16]
5409 sub sp, sp, outgoing_args_size */
5410 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5411 cfun->machine->frame.final_adjust
5412 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5413 }
5414
5415 cfun->machine->frame.laid_out = true;
5416 }
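/* Worked example of the layout above (illustrative sizes, not taken from any
   particular function): a function that needs a frame chain, saves x19 and
   x20, has 16 bytes of locals and no outgoing arguments gets

     reg_offset[x29] = 0, reg_offset[x30] = 8,
     reg_offset[x19] = 16, reg_offset[x20] = 24,
     saved_regs_size = 32, hard_fp_offset = 48, frame_size = 48.

   frame_size (48) is constant, below max_push_offset (512), and there are no
   outgoing arguments, so the first case applies: callee_adjust = 48 and the
   prologue becomes

     stp x29, x30, [sp, -48]!
     mov x29, sp
     stp x19, x20, [sp, 16]  */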
5417
5418 /* Return true if the register REGNO is saved on entry to
5419 the current function. */
5420
5421 static bool
5422 aarch64_register_saved_on_entry (int regno)
5423 {
5424 return cfun->machine->frame.reg_offset[regno] >= 0;
5425 }
5426
5427 /* Return the next register number, at or above REGNO and up to LIMIT,
5428 that the callee needs to save. */
5429
5430 static unsigned
5431 aarch64_next_callee_save (unsigned regno, unsigned limit)
5432 {
5433 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5434 regno++;
5435 return regno;
5436 }
5437
5438 /* Push the register number REGNO of mode MODE to the stack with write-back
5439 adjusting the stack by ADJUSTMENT. */
5440
5441 static void
5442 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5443 HOST_WIDE_INT adjustment)
5444 {
5445 rtx base_rtx = stack_pointer_rtx;
5446 rtx insn, reg, mem;
5447
5448 reg = gen_rtx_REG (mode, regno);
5449 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5450 plus_constant (Pmode, base_rtx, -adjustment));
5451 mem = gen_frame_mem (mode, mem);
5452
5453 insn = emit_move_insn (mem, reg);
5454 RTX_FRAME_RELATED_P (insn) = 1;
5455 }
5456
5457 /* Generate and return an instruction to store the pair of registers
5458 REG and REG2 of mode MODE to location BASE with write-back adjusting
5459 the stack location BASE by ADJUSTMENT. */
5460
5461 static rtx
5462 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5463 HOST_WIDE_INT adjustment)
5464 {
5465 switch (mode)
5466 {
5467 case E_DImode:
5468 return gen_storewb_pairdi_di (base, base, reg, reg2,
5469 GEN_INT (-adjustment),
5470 GEN_INT (UNITS_PER_WORD - adjustment));
5471 case E_DFmode:
5472 return gen_storewb_pairdf_di (base, base, reg, reg2,
5473 GEN_INT (-adjustment),
5474 GEN_INT (UNITS_PER_WORD - adjustment));
5475 case E_TFmode:
5476 return gen_storewb_pairtf_di (base, base, reg, reg2,
5477 GEN_INT (-adjustment),
5478 GEN_INT (UNITS_PER_VREG - adjustment));
5479 default:
5480 gcc_unreachable ();
5481 }
5482 }
5483
5484 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5485 stack pointer by ADJUSTMENT. */
5486
5487 static void
5488 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5489 {
5490 rtx_insn *insn;
5491 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5492
5493 if (regno2 == INVALID_REGNUM)
5494 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5495
5496 rtx reg1 = gen_rtx_REG (mode, regno1);
5497 rtx reg2 = gen_rtx_REG (mode, regno2);
5498
5499 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5500 reg2, adjustment));
5501 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5502 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5503 RTX_FRAME_RELATED_P (insn) = 1;
5504 }
5505
5506 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5507 adjusting it by ADJUSTMENT afterwards. */
5508
5509 static rtx
5510 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5511 HOST_WIDE_INT adjustment)
5512 {
5513 switch (mode)
5514 {
5515 case E_DImode:
5516 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5517 GEN_INT (UNITS_PER_WORD));
5518 case E_DFmode:
5519 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5520 GEN_INT (UNITS_PER_WORD));
5521 case E_TFmode:
5522 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5523 GEN_INT (UNITS_PER_VREG));
5524 default:
5525 gcc_unreachable ();
5526 }
5527 }
5528
5529 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5530 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5531 into CFI_OPS. */
5532
5533 static void
5534 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5535 rtx *cfi_ops)
5536 {
5537 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5538 rtx reg1 = gen_rtx_REG (mode, regno1);
5539
5540 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5541
5542 if (regno2 == INVALID_REGNUM)
5543 {
5544 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5545 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5546 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5547 }
5548 else
5549 {
5550 rtx reg2 = gen_rtx_REG (mode, regno2);
5551 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5552 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5553 reg2, adjustment));
5554 }
5555 }
5556
5557 /* Generate and return a store pair instruction of mode MODE to store
5558 register REG1 to MEM1 and register REG2 to MEM2. */
5559
5560 static rtx
5561 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5562 rtx reg2)
5563 {
5564 switch (mode)
5565 {
5566 case E_DImode:
5567 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5568
5569 case E_DFmode:
5570 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5571
5572 case E_TFmode:
5573 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5574
5575 default:
5576 gcc_unreachable ();
5577 }
5578 }
5579
5580 /* Generate and return a load pair instruction of mode MODE to load register
5581 REG1 from MEM1 and register REG2 from MEM2. */
5582
5583 static rtx
5584 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5585 rtx mem2)
5586 {
5587 switch (mode)
5588 {
5589 case E_DImode:
5590 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5591
5592 case E_DFmode:
5593 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5594
5595 case E_TFmode:
5596 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5597
5598 default:
5599 gcc_unreachable ();
5600 }
5601 }
5602
5603 /* Return TRUE if return address signing should be enabled for the current
5604 function, otherwise return FALSE. */
5605
5606 bool
5607 aarch64_return_address_signing_enabled (void)
5608 {
5609 /* This function should only be called after the frame has been laid out. */
5610 gcc_assert (cfun->machine->frame.laid_out);
5611
5612 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5613 if its LR is pushed onto the stack. */
5614 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5615 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5616 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5617 }
5618
5619 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5620 bool
5621 aarch64_bti_enabled (void)
5622 {
5623 return (aarch64_enable_bti == 1);
5624 }
5625
5626 /* Emit code to save the callee-saved registers of mode MODE from register
5627 number START to LIMIT to the stack at the location starting at offset
5628 START_OFFSET, skipping any write-back candidates if SKIP_WB is true. */
5629
5630 static void
5631 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5632 unsigned start, unsigned limit, bool skip_wb)
5633 {
5634 rtx_insn *insn;
5635 unsigned regno;
5636 unsigned regno2;
5637
5638 for (regno = aarch64_next_callee_save (start, limit);
5639 regno <= limit;
5640 regno = aarch64_next_callee_save (regno + 1, limit))
5641 {
5642 rtx reg, mem;
5643 poly_int64 offset;
5644 int offset_diff;
5645
5646 if (skip_wb
5647 && (regno == cfun->machine->frame.wb_candidate1
5648 || regno == cfun->machine->frame.wb_candidate2))
5649 continue;
5650
5651 if (cfun->machine->reg_is_wrapped_separately[regno])
5652 continue;
5653
5654 reg = gen_rtx_REG (mode, regno);
5655 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5656 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5657 offset));
5658
5659 regno2 = aarch64_next_callee_save (regno + 1, limit);
5660 offset_diff = cfun->machine->frame.reg_offset[regno2]
5661 - cfun->machine->frame.reg_offset[regno];
5662
5663 if (regno2 <= limit
5664 && !cfun->machine->reg_is_wrapped_separately[regno2]
5665 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5666 {
5667 rtx reg2 = gen_rtx_REG (mode, regno2);
5668 rtx mem2;
5669
5670 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5671 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5672 offset));
5673 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5674 reg2));
5675
5676 /* The first part of a frame-related parallel insn is
5677 always assumed to be relevant to the frame
5678 calculations; subsequent parts are only
5679 frame-related if explicitly marked. */
5680 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5681 regno = regno2;
5682 }
5683 else
5684 insn = emit_move_insn (mem, reg);
5685
5686 RTX_FRAME_RELATED_P (insn) = 1;
5687 }
5688 }
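/* For instance (hypothetical offsets): with x19 at reg_offset 16 and x20 at
   reg_offset 24, DImode gives offset_diff == GET_MODE_SIZE == 8, so the two
   saves above are merged into a single "stp x19, x20, [sp, start_offset + 16]"
   instead of two separate str instructions.  */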
5689
5690 /* Emit code to restore the callee registers of mode MODE from register
5691 number START up to and including LIMIT. Restore from the stack offset
5692 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5693 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5694
5695 static void
5696 aarch64_restore_callee_saves (machine_mode mode,
5697 poly_int64 start_offset, unsigned start,
5698 unsigned limit, bool skip_wb, rtx *cfi_ops)
5699 {
5700 rtx base_rtx = stack_pointer_rtx;
5701 unsigned regno;
5702 unsigned regno2;
5703 poly_int64 offset;
5704
5705 for (regno = aarch64_next_callee_save (start, limit);
5706 regno <= limit;
5707 regno = aarch64_next_callee_save (regno + 1, limit))
5708 {
5709 if (cfun->machine->reg_is_wrapped_separately[regno])
5710 continue;
5711
5712 rtx reg, mem;
5713 int offset_diff;
5714
5715 if (skip_wb
5716 && (regno == cfun->machine->frame.wb_candidate1
5717 || regno == cfun->machine->frame.wb_candidate2))
5718 continue;
5719
5720 reg = gen_rtx_REG (mode, regno);
5721 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5722 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5723
5724 regno2 = aarch64_next_callee_save (regno + 1, limit);
5725 offset_diff = cfun->machine->frame.reg_offset[regno2]
5726 - cfun->machine->frame.reg_offset[regno];
5727
5728 if (regno2 <= limit
5729 && !cfun->machine->reg_is_wrapped_separately[regno2]
5730 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5731 {
5732 rtx reg2 = gen_rtx_REG (mode, regno2);
5733 rtx mem2;
5734
5735 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5736 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5737 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5738
5739 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5740 regno = regno2;
5741 }
5742 else
5743 emit_move_insn (reg, mem);
5744 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5745 }
5746 }
5747
5748 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5749 of MODE. */
5750
5751 static inline bool
5752 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5753 {
5754 HOST_WIDE_INT multiple;
5755 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5756 && IN_RANGE (multiple, -8, 7));
5757 }
5758
5759 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5760 of MODE. */
5761
5762 static inline bool
5763 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5764 {
5765 HOST_WIDE_INT multiple;
5766 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5767 && IN_RANGE (multiple, 0, 63));
5768 }
5769
5770 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5771 of MODE. */
5772
5773 bool
5774 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5775 {
5776 HOST_WIDE_INT multiple;
5777 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5778 && IN_RANGE (multiple, -64, 63));
5779 }
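/* For DImode, for example, this accepts byte offsets that are multiples of 8
   in the range [-512, 504], which is the immediate range of an LDP/STP of two
   X registers.  */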
5780
5781 /* Return true if OFFSET is a signed 9-bit value. */
5782
5783 bool
5784 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5785 poly_int64 offset)
5786 {
5787 HOST_WIDE_INT const_offset;
5788 return (offset.is_constant (&const_offset)
5789 && IN_RANGE (const_offset, -256, 255));
5790 }
5791
5792 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5793 of MODE. */
5794
5795 static inline bool
5796 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5797 {
5798 HOST_WIDE_INT multiple;
5799 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5800 && IN_RANGE (multiple, -256, 255));
5801 }
5802
5803 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5804 of MODE. */
5805
5806 static inline bool
5807 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5808 {
5809 HOST_WIDE_INT multiple;
5810 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5811 && IN_RANGE (multiple, 0, 4095));
5812 }
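/* For DImode, for example, this accepts byte offsets that are multiples of 8
   in the range [0, 32760], i.e. the unsigned scaled immediate range of a
   single LDR/STR of an X register.  */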
5813
5814 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5815
5816 static sbitmap
5817 aarch64_get_separate_components (void)
5818 {
5819 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5820 bitmap_clear (components);
5821
5822 /* The registers we need saved to the frame. */
5823 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5824 if (aarch64_register_saved_on_entry (regno))
5825 {
5826 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5827 if (!frame_pointer_needed)
5828 offset += cfun->machine->frame.frame_size
5829 - cfun->machine->frame.hard_fp_offset;
5830 /* Check that we can access the stack slot of the register with one
5831 direct load with no adjustments needed. */
5832 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5833 bitmap_set_bit (components, regno);
5834 }
5835
5836 /* Don't mess with the hard frame pointer. */
5837 if (frame_pointer_needed)
5838 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5839
5840 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5841 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5842 /* If registers have been chosen to be stored/restored with
5843 writeback don't interfere with them to avoid having to output explicit
5844 stack adjustment instructions. */
5845 if (reg2 != INVALID_REGNUM)
5846 bitmap_clear_bit (components, reg2);
5847 if (reg1 != INVALID_REGNUM)
5848 bitmap_clear_bit (components, reg1);
5849
5850 bitmap_clear_bit (components, LR_REGNUM);
5851 bitmap_clear_bit (components, SP_REGNUM);
5852
5853 return components;
5854 }
5855
5856 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5857
5858 static sbitmap
5859 aarch64_components_for_bb (basic_block bb)
5860 {
5861 bitmap in = DF_LIVE_IN (bb);
5862 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5863 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5864 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5865
5866 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5867 bitmap_clear (components);
5868
5869 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5870 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5871 if ((!call_used_regs[regno]
5872 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5873 && (bitmap_bit_p (in, regno)
5874 || bitmap_bit_p (gen, regno)
5875 || bitmap_bit_p (kill, regno)))
5876 {
5877 unsigned regno2, offset, offset2;
5878 bitmap_set_bit (components, regno);
5879
5880 /* If there is a callee-save at an adjacent offset, add it too
5881 to increase the use of LDP/STP. */
5882 offset = cfun->machine->frame.reg_offset[regno];
5883 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5884
5885 if (regno2 <= LAST_SAVED_REGNUM)
5886 {
5887 offset2 = cfun->machine->frame.reg_offset[regno2];
5888 if ((offset & ~8) == (offset2 & ~8))
5889 bitmap_set_bit (components, regno2);
5890 }
5891 }
5892
5893 return components;
5894 }
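/* Sketch of the pairing heuristic above with hypothetical offsets: if x19
   lives at reg_offset 16 and x20 at reg_offset 24, then for x19 we have
   (offset & 8) == 0, so regno2 is x20, and (16 & ~8) == (24 & ~8) == 16,
   so x20 is added to the component set as well, making it more likely that
   the two are saved and restored with a single STP/LDP.  */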
5895
5896 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5897 Nothing to do for aarch64. */
5898
5899 static void
5900 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5901 {
5902 }
5903
5904 /* Return the next set bit in BMP from START onwards. Return the total number
5905 of bits in BMP if no set bit is found at or after START. */
5906
5907 static unsigned int
5908 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5909 {
5910 unsigned int nbits = SBITMAP_SIZE (bmp);
5911 if (start == nbits)
5912 return start;
5913
5914 gcc_assert (start < nbits);
5915 for (unsigned int i = start; i < nbits; i++)
5916 if (bitmap_bit_p (bmp, i))
5917 return i;
5918
5919 return nbits;
5920 }
5921
5922 /* Do the work for aarch64_emit_prologue_components and
5923 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5924 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5925 for these components or the epilogue sequence. That is, it determines
5926 whether we should emit stores or loads and what kind of CFA notes to attach
5927 to the insns. Otherwise the logic for the two sequences is very
5928 similar. */
5929
5930 static void
5931 aarch64_process_components (sbitmap components, bool prologue_p)
5932 {
5933 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5934 ? HARD_FRAME_POINTER_REGNUM
5935 : STACK_POINTER_REGNUM);
5936
5937 unsigned last_regno = SBITMAP_SIZE (components);
5938 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5939 rtx_insn *insn = NULL;
5940
5941 while (regno != last_regno)
5942 {
5943 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5944 so DFmode for the vector registers is enough. For simd functions
5945 we want to save the low 128 bits. */
5946 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5947
5948 rtx reg = gen_rtx_REG (mode, regno);
5949 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5950 if (!frame_pointer_needed)
5951 offset += cfun->machine->frame.frame_size
5952 - cfun->machine->frame.hard_fp_offset;
5953 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5954 rtx mem = gen_frame_mem (mode, addr);
5955
5956 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5957 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5958 /* No more registers to handle after REGNO.
5959 Emit a single save/restore and exit. */
5960 if (regno2 == last_regno)
5961 {
5962 insn = emit_insn (set);
5963 RTX_FRAME_RELATED_P (insn) = 1;
5964 if (prologue_p)
5965 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5966 else
5967 add_reg_note (insn, REG_CFA_RESTORE, reg);
5968 break;
5969 }
5970
5971 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5972 /* The next register is not of the same class or its offset is not
5973 mergeable with the current one into a pair. */
5974 if (!satisfies_constraint_Ump (mem)
5975 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5976 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5977 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5978 GET_MODE_SIZE (mode)))
5979 {
5980 insn = emit_insn (set);
5981 RTX_FRAME_RELATED_P (insn) = 1;
5982 if (prologue_p)
5983 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5984 else
5985 add_reg_note (insn, REG_CFA_RESTORE, reg);
5986
5987 regno = regno2;
5988 continue;
5989 }
5990
5991 /* REGNO2 can be saved/restored in a pair with REGNO. */
5992 rtx reg2 = gen_rtx_REG (mode, regno2);
5993 if (!frame_pointer_needed)
5994 offset2 += cfun->machine->frame.frame_size
5995 - cfun->machine->frame.hard_fp_offset;
5996 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5997 rtx mem2 = gen_frame_mem (mode, addr2);
5998 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5999 : gen_rtx_SET (reg2, mem2);
6000
6001 if (prologue_p)
6002 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6003 else
6004 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6005
6006 RTX_FRAME_RELATED_P (insn) = 1;
6007 if (prologue_p)
6008 {
6009 add_reg_note (insn, REG_CFA_OFFSET, set);
6010 add_reg_note (insn, REG_CFA_OFFSET, set2);
6011 }
6012 else
6013 {
6014 add_reg_note (insn, REG_CFA_RESTORE, reg);
6015 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6016 }
6017
6018 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6019 }
6020 }
6021
6022 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6023
6024 static void
6025 aarch64_emit_prologue_components (sbitmap components)
6026 {
6027 aarch64_process_components (components, true);
6028 }
6029
6030 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6031
6032 static void
6033 aarch64_emit_epilogue_components (sbitmap components)
6034 {
6035 aarch64_process_components (components, false);
6036 }
6037
6038 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6039
6040 static void
6041 aarch64_set_handled_components (sbitmap components)
6042 {
6043 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6044 if (bitmap_bit_p (components, regno))
6045 cfun->machine->reg_is_wrapped_separately[regno] = true;
6046 }
6047
6048 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6049 determine the probe offset for alloca. */
6050
6051 static HOST_WIDE_INT
6052 aarch64_stack_clash_protection_alloca_probe_range (void)
6053 {
6054 return STACK_CLASH_CALLER_GUARD;
6055 }
6056
6057
6058 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6059 registers. If POLY_SIZE is not large enough to require a probe this function
6060 will only adjust the stack. When allocating the stack space
6061 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6062 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6063 arguments. If we are, then we ensure that any allocation larger than the
6064 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer is
6065 maintained.
6066
6067 We emit barriers after each stack adjustment to prevent optimizations from
6068 breaking the invariant that we never drop the stack more than a page. This
6069 invariant is needed to make it easier to correctly handle asynchronous
6070 events, e.g. if we were to allow the stack to be dropped by more than a page
6071 and only probe it with several probes afterwards, and a signal arrived
6072 somewhere in between, then the signal handler would not know the state of the
6073 stack and could make no assumptions about which pages have been probed. */
6074
6075 static void
6076 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6077 poly_int64 poly_size,
6078 bool frame_related_p,
6079 bool final_adjustment_p)
6080 {
6081 HOST_WIDE_INT guard_size
6082 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6083 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6084 /* When doing the final adjustment for the outgoing argument size we can't
6085 assume that LR was saved at position 0. So subtract its offset from the
6086 ABI safe buffer so that we don't accidentally allow an adjustment that
6087 would result in an allocation larger than the ABI buffer without
6088 probing. */
6089 HOST_WIDE_INT min_probe_threshold
6090 = final_adjustment_p
6091 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6092 : guard_size - guard_used_by_caller;
6093
6094 poly_int64 frame_size = cfun->machine->frame.frame_size;
6095
6096 /* We should always have a positive probe threshold. */
6097 gcc_assert (min_probe_threshold > 0);
6098
6099 if (flag_stack_clash_protection && !final_adjustment_p)
6100 {
6101 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6102 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6103
6104 if (known_eq (frame_size, 0))
6105 {
6106 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6107 }
6108 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6109 && known_lt (final_adjust, guard_used_by_caller))
6110 {
6111 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6112 }
6113 }
6114
6115 /* If SIZE is not large enough to require probing, just adjust the stack and
6116 exit. */
6117 if (known_lt (poly_size, min_probe_threshold)
6118 || !flag_stack_clash_protection)
6119 {
6120 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6121 return;
6122 }
6123
6124 HOST_WIDE_INT size;
6125 /* Handle the SVE non-constant case first. */
6126 if (!poly_size.is_constant (&size))
6127 {
6128 if (dump_file)
6129 {
6130 fprintf (dump_file, "Stack clash SVE prologue: ");
6131 print_dec (poly_size, dump_file);
6132 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6133 }
6134
6135 /* First calculate the amount of bytes we're actually spilling. */
6136 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6137 poly_size, temp1, temp2, false, true);
6138
6139 rtx_insn *insn = get_last_insn ();
6140
6141 if (frame_related_p)
6142 {
6143 /* This is done to provide unwinding information for the stack
6144 adjustments we're about to do; however, to prevent the optimizers
6145 from removing the R11 move and leaving the CFA note (which would be
6146 very wrong) we tie the old and new stack pointers together.
6147 The tie will expand to nothing but the optimizers will not touch
6148 the instruction. */
6149 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6150 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6151 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6152
6153 /* We want the CFA independent of the stack pointer for the
6154 duration of the loop. */
6155 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6156 RTX_FRAME_RELATED_P (insn) = 1;
6157 }
6158
6159 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6160 rtx guard_const = gen_int_mode (guard_size, Pmode);
6161
6162 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6163 stack_pointer_rtx, temp1,
6164 probe_const, guard_const));
6165
6166 /* Now reset the CFA register if needed. */
6167 if (frame_related_p)
6168 {
6169 add_reg_note (insn, REG_CFA_DEF_CFA,
6170 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6171 gen_int_mode (poly_size, Pmode)));
6172 RTX_FRAME_RELATED_P (insn) = 1;
6173 }
6174
6175 return;
6176 }
6177
6178 if (dump_file)
6179 fprintf (dump_file,
6180 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6181 " bytes, probing will be required.\n", size);
6182
6183 /* Round size to the nearest multiple of guard_size, and calculate the
6184 residual as the difference between the original size and the rounded
6185 size. */
6186 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6187 HOST_WIDE_INT residual = size - rounded_size;
6188
6189 /* We can handle a small number of allocations/probes inline. Otherwise
6190 punt to a loop. */
6191 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6192 {
6193 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6194 {
6195 aarch64_sub_sp (NULL, temp2, guard_size, true);
6196 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6197 guard_used_by_caller));
6198 emit_insn (gen_blockage ());
6199 }
6200 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6201 }
6202 else
6203 {
6204 /* Compute the ending address. */
6205 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6206 temp1, NULL, false, true);
6207 rtx_insn *insn = get_last_insn ();
6208
6209 /* For the initial allocation, we don't have a frame pointer
6210 set up, so we always need CFI notes. If we're doing the
6211 final allocation, then we may have a frame pointer, in which
6212 case it is the CFA, otherwise we need CFI notes.
6213
6214 We can determine which allocation we are doing by looking at
6215 the value of FRAME_RELATED_P since the final allocations are not
6216 frame related. */
6217 if (frame_related_p)
6218 {
6219 /* We want the CFA independent of the stack pointer for the
6220 duration of the loop. */
6221 add_reg_note (insn, REG_CFA_DEF_CFA,
6222 plus_constant (Pmode, temp1, rounded_size));
6223 RTX_FRAME_RELATED_P (insn) = 1;
6224 }
6225
6226 /* This allocates and probes the stack. Note that this re-uses some of
6227 the existing Ada stack protection code. However we are guaranteed not
6228 to enter the non-loop or residual branches of that code.
6229
6230 The non-loop part won't be entered because if our allocation amount
6231 doesn't require a loop, the case above would handle it.
6232
6233 The residual amount won't be entered because TEMP1 is a multiple of
6234 the allocation size. The residual will always be 0. As such, the only
6235 part we are actually using from that code is the loop setup. The
6236 actual probing is done in aarch64_output_probe_stack_range. */
6237 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6238 stack_pointer_rtx, temp1));
6239
6240 /* Now reset the CFA register if needed. */
6241 if (frame_related_p)
6242 {
6243 add_reg_note (insn, REG_CFA_DEF_CFA,
6244 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6245 RTX_FRAME_RELATED_P (insn) = 1;
6246 }
6247
6248 emit_insn (gen_blockage ());
6249 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6250 }
6251
6252 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6253 be probed. This maintains the requirement that each page is probed at
6254 least once. For initial probing we probe only if the allocation is
6255 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6256 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6257 GUARD_SIZE. This ensures that for any allocation that is large enough to
6258 trigger a probe here, we'll have at least one, and if they're not large
6259 enough for this code to emit anything for them, the page would have been
6260 probed by the saving of FP/LR either by this function or any callees. If
6261 we don't have any callees then we won't have more stack adjustments and so
6262 are still safe. */
6263 if (residual)
6264 {
6265 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6266 /* If we're doing final adjustments, and we've done any full page
6267 allocations then any residual needs to be probed. */
6268 if (final_adjustment_p && rounded_size != 0)
6269 min_probe_threshold = 0;
6270 /* If doing a small final adjustment, we always probe at offset 0.
6271 This is done to avoid issues when LR is not at position 0 or when
6272 the final adjustment is smaller than the probing offset. */
6273 else if (final_adjustment_p && rounded_size == 0)
6274 residual_probe_offset = 0;
6275
6276 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6277 if (residual >= min_probe_threshold)
6278 {
6279 if (dump_file)
6280 fprintf (dump_file,
6281 "Stack clash AArch64 prologue residuals: "
6282 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6283 "\n", residual);
6284
6285 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6286 residual_probe_offset));
6287 emit_insn (gen_blockage ());
6288 }
6289 }
6290 }
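/* Rough numbers for orientation, assuming the default 64kB guard (so
   guard_used_by_caller == 1kB and min_probe_threshold == 63kB for the
   initial adjustment): an initial allocation below 63kB is emitted as a
   plain "sub sp, sp, #size" with no probe.  A 200kB allocation is rounded
   down to three 64kB pages; if that is within the unroll limit, each page is
   allocated and probed inline ("sub sp, sp, 65536" followed by
   "str xzr, [sp, 1024]"), and the remaining 8kB residual is allocated
   without a further probe because it is below the threshold.  */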
6291
6292 /* Return 1 if the register is used by the epilogue. We need to say the
6293 return register is used, but only after epilogue generation is complete.
6294 Note that in the case of sibcalls, the values "used by the epilogue" are
6295 considered live at the start of the called function.
6296
6297 For SIMD functions we need to return 1 for FP registers that are saved and
6298 restored by a function but are not zero in call_used_regs. If we do not do
6299 this, optimizations may remove the restore of the register. */
6300
6301 int
6302 aarch64_epilogue_uses (int regno)
6303 {
6304 if (epilogue_completed)
6305 {
6306 if (regno == LR_REGNUM)
6307 return 1;
6308 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6309 return 1;
6310 }
6311 return 0;
6312 }
6313
6314 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6315 is saved at BASE + OFFSET. */
6316
6317 static void
6318 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6319 rtx base, poly_int64 offset)
6320 {
6321 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6322 add_reg_note (insn, REG_CFA_EXPRESSION,
6323 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6324 }
6325
6326 /* AArch64 stack frames generated by this compiler look like:
6327
6328 +-------------------------------+
6329 | |
6330 | incoming stack arguments |
6331 | |
6332 +-------------------------------+
6333 | | <-- incoming stack pointer (aligned)
6334 | callee-allocated save area |
6335 | for register varargs |
6336 | |
6337 +-------------------------------+
6338 | local variables | <-- frame_pointer_rtx
6339 | |
6340 +-------------------------------+
6341 | padding | \
6342 +-------------------------------+ |
6343 | callee-saved registers | | frame.saved_regs_size
6344 +-------------------------------+ |
6345 | LR' | |
6346 +-------------------------------+ |
6347 | FP' | / <- hard_frame_pointer_rtx (aligned)
6348 +-------------------------------+
6349 | dynamic allocation |
6350 +-------------------------------+
6351 | padding |
6352 +-------------------------------+
6353 | outgoing stack arguments | <-- arg_pointer
6354 | |
6355 +-------------------------------+
6356 | | <-- stack_pointer_rtx (aligned)
6357
6358 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6359 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6360 unchanged.
6361
6362 By default for stack-clash we assume the guard is at least 64KB, but this
6363 value is configurable to either 4KB or 64KB. We also force the guard size to
6364 be the same as the probing interval and both values are kept in sync.
6365
6366 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6367 on the guard size) of stack space without probing.
6368
6369 When probing is needed, we emit a probe at the start of the prologue
6370 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6371
6372 We have to track how much space has been allocated and the only stores
6373 to the stack we track as implicit probes are the FP/LR stores.
6374
6375 For outgoing arguments we probe if the size is larger than 1KB, such that
6376 the ABI specified buffer is maintained for the next callee.
6377
6378 The following registers are reserved during frame layout and should not be
6379 used for any other purpose:
6380
6381 - r11: Used by stack clash protection when SVE is enabled.
6382 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6383 - r14 and r15: Used for speculation tracking.
6384 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6385 - r30(LR), r29(FP): Used by standard frame layout.
6386
6387 These registers must be avoided in frame layout related code unless the
6388 explicit intention is to interact with one of the features listed above. */
6389
6390 /* Generate the prologue instructions for entry into a function.
6391 Establish the stack frame by decreasing the stack pointer with a
6392 properly calculated size and, if necessary, create a frame record
6393 filled with the values of LR and previous frame pointer. The
6394 current FP is also set up if it is in use. */
6395
6396 void
6397 aarch64_expand_prologue (void)
6398 {
6399 poly_int64 frame_size = cfun->machine->frame.frame_size;
6400 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6401 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6402 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6403 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6404 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6405 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6406 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6407 rtx_insn *insn;
6408
6409 /* Sign return address for functions. */
6410 if (aarch64_return_address_signing_enabled ())
6411 {
6412 switch (aarch64_ra_sign_key)
6413 {
6414 case AARCH64_KEY_A:
6415 insn = emit_insn (gen_paciasp ());
6416 break;
6417 case AARCH64_KEY_B:
6418 insn = emit_insn (gen_pacibsp ());
6419 break;
6420 default:
6421 gcc_unreachable ();
6422 }
6423 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6424 RTX_FRAME_RELATED_P (insn) = 1;
6425 }
6426
6427 if (flag_stack_usage_info)
6428 current_function_static_stack_size = constant_lower_bound (frame_size);
6429
6430 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6431 {
6432 if (crtl->is_leaf && !cfun->calls_alloca)
6433 {
6434 if (maybe_gt (frame_size, PROBE_INTERVAL)
6435 && maybe_gt (frame_size, get_stack_check_protect ()))
6436 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6437 (frame_size
6438 - get_stack_check_protect ()));
6439 }
6440 else if (maybe_gt (frame_size, 0))
6441 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6442 }
6443
6444 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6445 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6446
6447 /* In theory we should never have both an initial adjustment
6448 and a callee save adjustment. Verify that is the case since the
6449 code below does not handle it for -fstack-clash-protection. */
6450 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6451
6452 /* Will only probe if the initial adjustment is larger than the guard
6453 less the amount of the guard reserved for use by the caller's
6454 outgoing args. */
6455 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6456 true, false);
6457
6458 if (callee_adjust != 0)
6459 aarch64_push_regs (reg1, reg2, callee_adjust);
6460
6461 if (emit_frame_chain)
6462 {
6463 poly_int64 reg_offset = callee_adjust;
6464 if (callee_adjust == 0)
6465 {
6466 reg1 = R29_REGNUM;
6467 reg2 = R30_REGNUM;
6468 reg_offset = callee_offset;
6469 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6470 }
6471 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6472 stack_pointer_rtx, callee_offset,
6473 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6474 if (frame_pointer_needed && !frame_size.is_constant ())
6475 {
6476 /* Variable-sized frames need to describe the save slot
6477 address using DW_CFA_expression rather than DW_CFA_offset.
6478 This means that, without taking further action, the
6479 locations of the registers that we've already saved would
6480 remain based on the stack pointer even after we redefine
6481 the CFA based on the frame pointer. We therefore need new
6482 DW_CFA_expressions to re-express the save slots with addresses
6483 based on the frame pointer. */
6484 rtx_insn *insn = get_last_insn ();
6485 gcc_assert (RTX_FRAME_RELATED_P (insn));
6486
6487 /* Add an explicit CFA definition if this was previously
6488 implicit. */
6489 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6490 {
6491 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6492 callee_offset);
6493 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6494 gen_rtx_SET (hard_frame_pointer_rtx, src));
6495 }
6496
6497 /* Change the save slot expressions for the registers that
6498 we've already saved. */
6499 reg_offset -= callee_offset;
6500 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6501 reg_offset + UNITS_PER_WORD);
6502 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6503 reg_offset);
6504 }
6505 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6506 }
6507
6508 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6509 callee_adjust != 0 || emit_frame_chain);
6510 if (aarch64_simd_decl_p (cfun->decl))
6511 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6512 callee_adjust != 0 || emit_frame_chain);
6513 else
6514 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6515 callee_adjust != 0 || emit_frame_chain);
6516
6517 /* We may need to probe the final adjustment if it is larger than the guard
6518 that is assumed by the callee. */
6519 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6520 !frame_pointer_needed, true);
6521 }
6522
6523 /* Return TRUE if we can use a simple_return insn.
6524
6525 This function checks whether the callee saved stack is empty, which
6526 means no restore actions are needed. The pro_and_epilogue pass will use
6527 this to check whether the shrink-wrapping optimization is feasible. */
6528
6529 bool
6530 aarch64_use_return_insn_p (void)
6531 {
6532 if (!reload_completed)
6533 return false;
6534
6535 if (crtl->profile)
6536 return false;
6537
6538 return known_eq (cfun->machine->frame.frame_size, 0);
6539 }
6540
6541 /* Return false for non-leaf SIMD functions in order to avoid
6542 shrink-wrapping them. Doing this will lose the necessary
6543 save/restore of FP registers. */
6544
6545 bool
6546 aarch64_use_simple_return_insn_p (void)
6547 {
6548 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6549 return false;
6550
6551 return true;
6552 }
6553
6554 /* Generate the epilogue instructions for returning from a function.
6555 This is almost exactly the reverse of the prolog sequence, except
6556 that we need to insert barriers to avoid scheduling loads that read
6557 from a deallocated stack, and we optimize the unwind records by
6558 emitting them all together if possible. */
6559 void
6560 aarch64_expand_epilogue (bool for_sibcall)
6561 {
6562 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6563 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6564 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6565 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6566 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6567 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6568 rtx cfi_ops = NULL;
6569 rtx_insn *insn;
6570 /* A stack clash protection prologue may not have left EP0_REGNUM or
6571 EP1_REGNUM in a usable state. The same is true for allocations
6572 with an SVE component, since we then need both temporary registers
6573 for each allocation. For stack clash we are in a usable state if
6574 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6575 HOST_WIDE_INT guard_size
6576 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6577 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6578
6579 /* We can re-use the registers when the allocation amount is smaller than
6580 guard_size - guard_used_by_caller because we won't be doing any probes
6581 then. In such situations the register should remain live with the correct
6582 value. */
6583 bool can_inherit_p = (initial_adjust.is_constant ()
6584 && final_adjust.is_constant ())
6585 && (!flag_stack_clash_protection
6586 || known_lt (initial_adjust,
6587 guard_size - guard_used_by_caller));
6588
6589 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6590 bool need_barrier_p
6591 = maybe_ne (get_frame_size ()
6592 + cfun->machine->frame.saved_varargs_size, 0);
6593
6594 /* Emit a barrier to prevent loads from a deallocated stack. */
6595 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6596 || cfun->calls_alloca
6597 || crtl->calls_eh_return)
6598 {
6599 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6600 need_barrier_p = false;
6601 }
6602
6603 /* Restore the stack pointer from the frame pointer if it may not
6604 be the same as the stack pointer. */
6605 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6606 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6607 if (frame_pointer_needed
6608 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6609 /* If writeback is used when restoring callee-saves, the CFA
6610 is restored on the instruction doing the writeback. */
6611 aarch64_add_offset (Pmode, stack_pointer_rtx,
6612 hard_frame_pointer_rtx, -callee_offset,
6613 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6614 else
6615 /* The case where we need to re-use the register here is very rare, so
6616 avoid the complicated condition and just always emit a move if the
6617 immediate doesn't fit. */
6618 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6619
6620 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6621 callee_adjust != 0, &cfi_ops);
6622 if (aarch64_simd_decl_p (cfun->decl))
6623 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6624 callee_adjust != 0, &cfi_ops);
6625 else
6626 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6627 callee_adjust != 0, &cfi_ops);
6628
6629 if (need_barrier_p)
6630 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6631
6632 if (callee_adjust != 0)
6633 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6634
6635 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6636 {
6637 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6638 insn = get_last_insn ();
6639 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6640 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6641 RTX_FRAME_RELATED_P (insn) = 1;
6642 cfi_ops = NULL;
6643 }
6644
6645 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6646 restrict the emit_move optimization to leaf functions. */
6647 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6648 (!can_inherit_p || !crtl->is_leaf
6649 || df_regs_ever_live_p (EP0_REGNUM)));
6650
6651 if (cfi_ops)
6652 {
6653 /* Emit delayed restores and reset the CFA to be SP. */
6654 insn = get_last_insn ();
6655 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6656 REG_NOTES (insn) = cfi_ops;
6657 RTX_FRAME_RELATED_P (insn) = 1;
6658 }
6659
6660 /* We prefer to emit the combined return/authenticate instruction RETAA,
6661 however there are three cases in which we must instead emit an explicit
6662 authentication instruction.
6663
6664 1) Sibcalls don't return in a normal way, so if we're about to call one
6665 we must authenticate.
6666
6667 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6668 generating code for !TARGET_ARMV8_3 we can't use it and must
6669 explicitly authenticate.
6670
6671 3) On an eh_return path we make extra stack adjustments to update the
6672 canonical frame address to be the exception handler's CFA. We want
6673 to authenticate using the CFA of the function which calls eh_return.
6674 */
6675 if (aarch64_return_address_signing_enabled ()
6676 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6677 {
6678 switch (aarch64_ra_sign_key)
6679 {
6680 case AARCH64_KEY_A:
6681 insn = emit_insn (gen_autiasp ());
6682 break;
6683 case AARCH64_KEY_B:
6684 insn = emit_insn (gen_autibsp ());
6685 break;
6686 default:
6687 gcc_unreachable ();
6688 }
6689 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6690 RTX_FRAME_RELATED_P (insn) = 1;
6691 }
6692
6693 /* Stack adjustment for exception handler. */
6694 if (crtl->calls_eh_return && !for_sibcall)
6695 {
6696 /* We need to unwind the stack by the offset computed by
6697 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6698 to be SP; letting the CFA move during this adjustment
6699 is just as correct as retaining the CFA from the body
6700 of the function. Therefore, do nothing special. */
6701 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6702 }
6703
6704 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6705 if (!for_sibcall)
6706 emit_jump_insn (ret_rtx);
6707 }
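/* Illustrative epilogue for the small-frame layout example given after
   aarch64_layout_frame (frame chain, x19/x20 saved, callee_adjust == 48,
   no return-address signing):

     ldp x19, x20, [sp, 16]
     ldp x29, x30, [sp], 48
     ret  */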
6708
6709 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6710 normally or return to a previous frame after unwinding.
6711
6712 An EH return uses a single shared return sequence. The epilogue is
6713 exactly like a normal epilogue except that it has an extra input
6714 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6715 that must be applied after the frame has been destroyed. An extra label
6716 is inserted before the epilogue which initializes this register to zero,
6717 and this is the entry point for a normal return.
6718
6719 An actual EH return updates the return address, initializes the stack
6720 adjustment and jumps directly into the epilogue (bypassing the zeroing
6721 of the adjustment). Since the return address is typically saved on the
6722 stack when a function makes a call, the saved LR must be updated outside
6723 the epilogue.
6724
6725 This poses problems as the store is generated well before the epilogue,
6726 so the offset of LR is not known yet. Also optimizations will remove the
6727 store as it appears dead, even after the epilogue is generated (as the
6728 base or offset for loading LR is different in many cases).
6729
6730 To avoid these problems this implementation forces the frame pointer
6731 in eh_return functions so that the location of LR is fixed and known early.
6732 It also marks the store volatile, so no optimization is permitted to
6733 remove the store. */
6734 rtx
6735 aarch64_eh_return_handler_rtx (void)
6736 {
6737 rtx tmp = gen_frame_mem (Pmode,
6738 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6739
6740 /* Mark the store volatile, so no optimization is permitted to remove it. */
6741 MEM_VOLATILE_P (tmp) = true;
6742 return tmp;
6743 }
6744
6745 /* Output code to add DELTA to the first argument, and then jump
6746 to FUNCTION. Used for C++ multiple inheritance. */
6747 static void
6748 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6749 HOST_WIDE_INT delta,
6750 HOST_WIDE_INT vcall_offset,
6751 tree function)
6752 {
6753 /* The this pointer is always in x0. Note that this differs from
6754 Arm where the this pointer may be bumped to r1 if r0 is required
6755 to return a pointer to an aggregate. On AArch64 a result value
6756 pointer will be in x8. */
6757 int this_regno = R0_REGNUM;
6758 rtx this_rtx, temp0, temp1, addr, funexp;
6759 rtx_insn *insn;
6760 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6761
6762 if (aarch64_bti_enabled ())
6763 emit_insn (gen_bti_c ());
6764
6765 reload_completed = 1;
6766 emit_note (NOTE_INSN_PROLOGUE_END);
6767
6768 this_rtx = gen_rtx_REG (Pmode, this_regno);
6769 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6770 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6771
6772 if (vcall_offset == 0)
6773 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6774 else
6775 {
6776 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6777
6778 addr = this_rtx;
6779 if (delta != 0)
6780 {
6781 if (delta >= -256 && delta < 256)
6782 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6783 plus_constant (Pmode, this_rtx, delta));
6784 else
6785 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6786 temp1, temp0, false);
6787 }
6788
6789 if (Pmode == ptr_mode)
6790 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6791 else
6792 aarch64_emit_move (temp0,
6793 gen_rtx_ZERO_EXTEND (Pmode,
6794 gen_rtx_MEM (ptr_mode, addr)));
6795
6796 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6797 addr = plus_constant (Pmode, temp0, vcall_offset);
6798 else
6799 {
6800 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6801 Pmode);
6802 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6803 }
6804
6805 if (Pmode == ptr_mode)
6806 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6807 else
6808 aarch64_emit_move (temp1,
6809 gen_rtx_SIGN_EXTEND (Pmode,
6810 gen_rtx_MEM (ptr_mode, addr)));
6811
6812 emit_insn (gen_add2_insn (this_rtx, temp1));
6813 }
6814
6815 /* Generate a tail call to the target function. */
6816 if (!TREE_USED (function))
6817 {
6818 assemble_external (function);
6819 TREE_USED (function) = 1;
6820 }
6821 funexp = XEXP (DECL_RTL (function), 0);
6822 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6823 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6824 SIBLING_CALL_P (insn) = 1;
6825
6826 insn = get_insns ();
6827 shorten_branches (insn);
6828
6829 assemble_start_function (thunk, fnname);
6830 final_start_function (insn, file, 1);
6831 final (insn, file, 1);
6832 final_end_function ();
6833 assemble_end_function (thunk, fnname);
6834
6835 /* Stop pretending to be a post-reload pass. */
6836 reload_completed = 0;
6837 }
6838
6839 static bool
6840 aarch64_tls_referenced_p (rtx x)
6841 {
6842 if (!TARGET_HAVE_TLS)
6843 return false;
6844 subrtx_iterator::array_type array;
6845 FOR_EACH_SUBRTX (iter, array, x, ALL)
6846 {
6847 const_rtx x = *iter;
6848 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6849 return true;
6850 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6851 TLS offsets, not real symbol references. */
6852 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6853 iter.skip_subrtxes ();
6854 }
6855 return false;
6856 }
6857
6858
6859 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6860 a left shift of 0 or 12 bits. */
6861 bool
6862 aarch64_uimm12_shift (HOST_WIDE_INT val)
6863 {
6864 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6865 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6866 );
6867 }
6868
6869 /* Return the largest value no greater than VAL that can be expressed as a
6870 12-bit unsigned immediate with a left shift of 0 or 12. */
6871 static HOST_WIDE_INT
6872 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6873 {
6874 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6875 handle correctly. */
6876 gcc_assert ((val & 0xffffff) == val);
6877
6878 if (((val & 0xfff) << 0) == val)
6879 return val;
6880
6881 return val & (0xfff << 12);
6882 }
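
/* Illustrative sketch, not part of GCC: how a (clamped) 24-bit constant
   decomposes into the two chunks that aarch64_uimm12_shift accepts, i.e.
   one "add #imm" piece and one "add #imm, lsl #12" piece.  The helper
   name and the use of unsigned long long instead of HOST_WIDE_INT are
   illustrative assumptions.  */
static void
example_split_uimm24 (unsigned long long val,
                      unsigned long long *lo12, unsigned long long *hi12)
{
  /* Assumes VAL already fits in 24 bits, as asserted above.  */
  *lo12 = val & 0xfffULL;                /* encodable with LSL #0   */
  *hi12 = val & (0xfffULL << 12);        /* encodable with LSL #12  */
  /* e.g. val == 0x123456 gives *lo12 == 0x456 and *hi12 == 0x123000,
     and *lo12 + *hi12 == val.  */
}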
6883
6884 /* Return true if val is an immediate that can be loaded into a
6885 register by a MOVZ instruction. */
6886 static bool
6887 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6888 {
6889 if (GET_MODE_SIZE (mode) > 4)
6890 {
6891 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6892 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6893 return 1;
6894 }
6895 else
6896 {
6897 /* Ignore sign extension. */
6898 val &= (HOST_WIDE_INT) 0xffffffff;
6899 }
6900 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6901 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6902 }
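
/* Illustrative sketch, not part of GCC: the MOVZ test above succeeds
   exactly when every set bit of a 64-bit value falls inside a single
   16-bit field at bit position 0, 16, 32 or 48.  A loop over the four
   halfwords gives the same answer; the helper name is an assumption.  */
static int
example_movz_encodable (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffULL << shift)) == val)
      return 1;
  return 0;
  /* e.g. 0xbeef0000 is encodable (movz w0, #0xbeef, lsl #16),
     while 0x12345 spans two halfwords and is not.  */
}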
6903
6904 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6905 64-bit (DImode) integer. */
6906
6907 static unsigned HOST_WIDE_INT
6908 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6909 {
6910 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6911 while (size < 64)
6912 {
6913 val &= (HOST_WIDE_INT_1U << size) - 1;
6914 val |= val << size;
6915 size *= 2;
6916 }
6917 return val;
6918 }
6919
6920 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6921
6922 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6923 {
6924 0x0000000100000001ull,
6925 0x0001000100010001ull,
6926 0x0101010101010101ull,
6927 0x1111111111111111ull,
6928 0x5555555555555555ull,
6929 };
6930
6931
6932 /* Return true if val is a valid bitmask immediate. */
6933
6934 bool
6935 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6936 {
6937 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6938 int bits;
6939
6940 /* Check for a single sequence of one bits and return quickly if so.
6941 The special cases of all ones and all zeroes return false. */
6942 val = aarch64_replicate_bitmask_imm (val_in, mode);
6943 tmp = val + (val & -val);
6944
6945 if (tmp == (tmp & -tmp))
6946 return (val + 1) > 1;
6947
6948 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6949 if (mode == SImode)
6950 val = (val << 32) | (val & 0xffffffff);
6951
6952 /* Invert if the immediate doesn't start with a zero bit - this means we
6953 only need to search for sequences of one bits. */
6954 if (val & 1)
6955 val = ~val;
6956
6957 /* Find the first set bit and set tmp to val with the first sequence of one
6958 bits removed. Return success if there is a single sequence of ones. */
6959 first_one = val & -val;
6960 tmp = val & (val + first_one);
6961
6962 if (tmp == 0)
6963 return true;
6964
6965 /* Find the next set bit and compute the difference in bit position. */
6966 next_one = tmp & -tmp;
6967 bits = clz_hwi (first_one) - clz_hwi (next_one);
6968 mask = val ^ tmp;
6969
6970 /* Check the bit position difference is a power of 2, and that the first
6971 sequence of one bits fits within 'bits' bits. */
6972 if ((mask >> bits) != 0 || bits != (bits & -bits))
6973 return false;
6974
6975 /* Check the sequence of one bits is repeated 64/bits times. */
6976 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6977 }
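
/* Illustrative sketch, not part of GCC: a slower reference test for
   AArch64 logical ("bitmask") immediates.  A 64-bit value is encodable
   when it is a repetition of a 2/4/8/16/32/64-bit element whose set
   bits form a single, possibly wrapping, run of ones (excluding
   all-zeros and all-ones).  aarch64_bitmask_imm above computes the
   same answer without looping; the helper names are assumptions.  */
static int
example_contiguous_ones (unsigned long long x)
{
  /* Nonzero X is one run of ones iff adding its lowest set bit clears
     the run without leaving any stray higher bits behind.  */
  return x != 0 && (((x & -x) + x) & x) == 0;
}

static int
example_is_bitmask_imm64 (unsigned long long val)
{
  if (val == 0 || val == ~0ULL)
    return 0;
  for (unsigned int esize = 2; esize <= 64; esize *= 2)
    {
      unsigned long long emask
        = esize == 64 ? ~0ULL : (1ULL << esize) - 1;
      unsigned long long elt = val & emask;

      /* The element must replicate across all 64 bits...  */
      unsigned long long rep = elt;
      for (unsigned int i = esize; i < 64; i += esize)
        rep |= elt << i;
      if (rep != val)
        continue;

      /* ...and its ones must form one run, either directly or after
         wrapping around the element boundary.  */
      return (example_contiguous_ones (elt)
              || example_contiguous_ones (~elt & emask));
    }
  return 0;
  /* e.g. 0x00ff00ff00ff00ff and 0x0000000ffffffff0 are encodable,
     while 0x1234 is not.  */
}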
6978
6979 /* Create a mask of ones covering the lowest through the highest set bit of VAL_IN.
6980 Assumed precondition: VAL_IN is not zero. */
6981
6982 unsigned HOST_WIDE_INT
6983 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6984 {
6985 int lowest_bit_set = ctz_hwi (val_in);
6986 int highest_bit_set = floor_log2 (val_in);
6987 gcc_assert (val_in != 0);
6988
6989 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6990 (HOST_WIDE_INT_1U << lowest_bit_set));
6991 }
6992
6993 /* Create a constant in which every bit outside the span from the lowest
6994 to the highest set bit of VAL_IN is set to 1. */
6995
6996 unsigned HOST_WIDE_INT
6997 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6998 {
6999 return val_in | ~aarch64_and_split_imm1 (val_in);
7000 }
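
/* Illustrative sketch, not part of GCC: the two-instruction AND split.
   IMM1 is a solid block of ones from the lowest to the highest set bit
   of VAL, IMM2 is VAL with everything outside that block forced to one,
   so IMM1 & IMM2 == VAL.  aarch64_and_bitmask_imm below then only
   accepts the split when IMM2 is itself a bitmask immediate.  The
   helper name and unsigned long long types are assumptions.  */
static void
example_and_split (unsigned long long val,
                   unsigned long long *imm1, unsigned long long *imm2)
{
  /* Assumes VAL is nonzero, like aarch64_and_split_imm1.  */
  int low = __builtin_ctzll (val);
  int high = 63 - __builtin_clzll (val);

  /* Ones from bit LOW up to bit HIGH inclusive.  */
  *imm1 = (2ULL << high) - (1ULL << low);
  /* VAL, with every bit outside that span set.  */
  *imm2 = val | ~*imm1;
  /* e.g. val == 0x0000ff000000000f gives *imm1 == 0x0000ffffffffffff
     and *imm2 == 0xffffff000000000f; both are bitmask immediates and
     *imm1 & *imm2 == val.  */
}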
7001
7002 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7003
7004 bool
7005 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7006 {
7007 scalar_int_mode int_mode;
7008 if (!is_a <scalar_int_mode> (mode, &int_mode))
7009 return false;
7010
7011 if (aarch64_bitmask_imm (val_in, int_mode))
7012 return false;
7013
7014 if (aarch64_move_imm (val_in, int_mode))
7015 return false;
7016
7017 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7018
7019 return aarch64_bitmask_imm (imm2, int_mode);
7020 }
7021
7022 /* Return true if val is an immediate that can be loaded into a
7023 register in a single instruction. */
7024 bool
7025 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7026 {
7027 scalar_int_mode int_mode;
7028 if (!is_a <scalar_int_mode> (mode, &int_mode))
7029 return false;
7030
7031 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7032 return 1;
7033 return aarch64_bitmask_imm (val, int_mode);
7034 }
7035
7036 static bool
7037 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7038 {
7039 rtx base, offset;
7040
7041 if (GET_CODE (x) == HIGH)
7042 return true;
7043
7044 /* There's no way to calculate VL-based values using relocations. */
7045 subrtx_iterator::array_type array;
7046 FOR_EACH_SUBRTX (iter, array, x, ALL)
7047 if (GET_CODE (*iter) == CONST_POLY_INT)
7048 return true;
7049
7050 split_const (x, &base, &offset);
7051 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7052 {
7053 if (aarch64_classify_symbol (base, INTVAL (offset))
7054 != SYMBOL_FORCE_TO_MEM)
7055 return true;
7056 else
7057 /* Avoid generating a 64-bit relocation in ILP32; leave
7058 to aarch64_expand_mov_immediate to handle it properly. */
7059 return mode != ptr_mode;
7060 }
7061
7062 return aarch64_tls_referenced_p (x);
7063 }
7064
7065 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7066 The expansion for a table switch is quite expensive due to the number
7067 of instructions, the table lookup and the hard-to-predict indirect jump.
7068 When optimizing for speed at -O3 and above, use the per-core tuning if
7069 set, otherwise use tables for > 16 cases as a tradeoff between size and
7070 performance. When optimizing for size, use the default setting. */
7071
7072 static unsigned int
7073 aarch64_case_values_threshold (void)
7074 {
7075 /* Use the specified limit for the number of cases before using jump
7076 tables at higher optimization levels. */
7077 if (optimize > 2
7078 && selected_cpu->tune->max_case_values != 0)
7079 return selected_cpu->tune->max_case_values;
7080 else
7081 return optimize_size ? default_case_values_threshold () : 17;
7082 }
7083
7084 /* Return true if register REGNO is a valid index register.
7085 STRICT_P is true if REG_OK_STRICT is in effect. */
7086
7087 bool
7088 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7089 {
7090 if (!HARD_REGISTER_NUM_P (regno))
7091 {
7092 if (!strict_p)
7093 return true;
7094
7095 if (!reg_renumber)
7096 return false;
7097
7098 regno = reg_renumber[regno];
7099 }
7100 return GP_REGNUM_P (regno);
7101 }
7102
7103 /* Return true if register REGNO is a valid base register.
7104 STRICT_P is true if REG_OK_STRICT is in effect. */
7105
7106 bool
7107 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7108 {
7109 if (!HARD_REGISTER_NUM_P (regno))
7110 {
7111 if (!strict_p)
7112 return true;
7113
7114 if (!reg_renumber)
7115 return false;
7116
7117 regno = reg_renumber[regno];
7118 }
7119
7120 /* The fake registers will be eliminated to either the stack or
7121 hard frame pointer, both of which are usually valid base registers.
7122 Reload deals with the cases where the eliminated form isn't valid. */
7123 return (GP_REGNUM_P (regno)
7124 || regno == SP_REGNUM
7125 || regno == FRAME_POINTER_REGNUM
7126 || regno == ARG_POINTER_REGNUM);
7127 }
7128
7129 /* Return true if X is a valid base register.
7130 STRICT_P is true if REG_OK_STRICT is in effect. */
7131
7132 static bool
7133 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7134 {
7135 if (!strict_p
7136 && GET_CODE (x) == SUBREG
7137 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7138 x = SUBREG_REG (x);
7139
7140 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7141 }
7142
7143 /* Return true if address offset is a valid index. If it is, fill in INFO
7144 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7145
7146 static bool
7147 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7148 machine_mode mode, bool strict_p)
7149 {
7150 enum aarch64_address_type type;
7151 rtx index;
7152 int shift;
7153
7154 /* (reg:P) */
7155 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7156 && GET_MODE (x) == Pmode)
7157 {
7158 type = ADDRESS_REG_REG;
7159 index = x;
7160 shift = 0;
7161 }
7162 /* (sign_extend:DI (reg:SI)) */
7163 else if ((GET_CODE (x) == SIGN_EXTEND
7164 || GET_CODE (x) == ZERO_EXTEND)
7165 && GET_MODE (x) == DImode
7166 && GET_MODE (XEXP (x, 0)) == SImode)
7167 {
7168 type = (GET_CODE (x) == SIGN_EXTEND)
7169 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7170 index = XEXP (x, 0);
7171 shift = 0;
7172 }
7173 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7174 else if (GET_CODE (x) == MULT
7175 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7176 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7177 && GET_MODE (XEXP (x, 0)) == DImode
7178 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7179 && CONST_INT_P (XEXP (x, 1)))
7180 {
7181 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7182 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7183 index = XEXP (XEXP (x, 0), 0);
7184 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7185 }
7186 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7187 else if (GET_CODE (x) == ASHIFT
7188 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7189 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7190 && GET_MODE (XEXP (x, 0)) == DImode
7191 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7192 && CONST_INT_P (XEXP (x, 1)))
7193 {
7194 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7195 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7196 index = XEXP (XEXP (x, 0), 0);
7197 shift = INTVAL (XEXP (x, 1));
7198 }
7199 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7200 else if ((GET_CODE (x) == SIGN_EXTRACT
7201 || GET_CODE (x) == ZERO_EXTRACT)
7202 && GET_MODE (x) == DImode
7203 && GET_CODE (XEXP (x, 0)) == MULT
7204 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7205 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7206 {
7207 type = (GET_CODE (x) == SIGN_EXTRACT)
7208 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7209 index = XEXP (XEXP (x, 0), 0);
7210 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7211 if (INTVAL (XEXP (x, 1)) != 32 + shift
7212 || INTVAL (XEXP (x, 2)) != 0)
7213 shift = -1;
7214 }
7215 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7216 (const_int 0xffffffff<<shift)) */
7217 else if (GET_CODE (x) == AND
7218 && GET_MODE (x) == DImode
7219 && GET_CODE (XEXP (x, 0)) == MULT
7220 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7221 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7222 && CONST_INT_P (XEXP (x, 1)))
7223 {
7224 type = ADDRESS_REG_UXTW;
7225 index = XEXP (XEXP (x, 0), 0);
7226 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7227 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7228 shift = -1;
7229 }
7230 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7231 else if ((GET_CODE (x) == SIGN_EXTRACT
7232 || GET_CODE (x) == ZERO_EXTRACT)
7233 && GET_MODE (x) == DImode
7234 && GET_CODE (XEXP (x, 0)) == ASHIFT
7235 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7236 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7237 {
7238 type = (GET_CODE (x) == SIGN_EXTRACT)
7239 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7240 index = XEXP (XEXP (x, 0), 0);
7241 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7242 if (INTVAL (XEXP (x, 1)) != 32 + shift
7243 || INTVAL (XEXP (x, 2)) != 0)
7244 shift = -1;
7245 }
7246 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7247 (const_int 0xffffffff<<shift)) */
7248 else if (GET_CODE (x) == AND
7249 && GET_MODE (x) == DImode
7250 && GET_CODE (XEXP (x, 0)) == ASHIFT
7251 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7252 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7253 && CONST_INT_P (XEXP (x, 1)))
7254 {
7255 type = ADDRESS_REG_UXTW;
7256 index = XEXP (XEXP (x, 0), 0);
7257 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7258 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7259 shift = -1;
7260 }
7261 /* (mult:P (reg:P) (const_int scale)) */
7262 else if (GET_CODE (x) == MULT
7263 && GET_MODE (x) == Pmode
7264 && GET_MODE (XEXP (x, 0)) == Pmode
7265 && CONST_INT_P (XEXP (x, 1)))
7266 {
7267 type = ADDRESS_REG_REG;
7268 index = XEXP (x, 0);
7269 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7270 }
7271 /* (ashift:P (reg:P) (const_int shift)) */
7272 else if (GET_CODE (x) == ASHIFT
7273 && GET_MODE (x) == Pmode
7274 && GET_MODE (XEXP (x, 0)) == Pmode
7275 && CONST_INT_P (XEXP (x, 1)))
7276 {
7277 type = ADDRESS_REG_REG;
7278 index = XEXP (x, 0);
7279 shift = INTVAL (XEXP (x, 1));
7280 }
7281 else
7282 return false;
7283
7284 if (!strict_p
7285 && GET_CODE (index) == SUBREG
7286 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7287 index = SUBREG_REG (index);
7288
7289 if (aarch64_sve_data_mode_p (mode))
7290 {
7291 if (type != ADDRESS_REG_REG
7292 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7293 return false;
7294 }
7295 else
7296 {
7297 if (shift != 0
7298 && !(IN_RANGE (shift, 1, 3)
7299 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7300 return false;
7301 }
7302
7303 if (REG_P (index)
7304 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7305 {
7306 info->type = type;
7307 info->offset = index;
7308 info->shift = shift;
7309 return true;
7310 }
7311
7312 return false;
7313 }
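
/* Illustrative sketch, not part of GCC: the scale rule enforced above
   for non-SVE modes.  A shifted index register is accepted either with
   no shift at all, or with an LSL of 1..3 that matches the access size,
   so e.g. "ldr w0, [x1, x2, lsl #2]" is valid for a 4-byte load.  The
   helper name and plain int types are assumptions.  */
static int
example_valid_index_shift (int shift, int access_size_bytes)
{
  return shift == 0
         || (shift >= 1 && shift <= 3 && (1 << shift) == access_size_bytes);
}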
7314
7315 /* Return true if MODE is one of the modes for which we
7316 support LDP/STP operations. */
7317
7318 static bool
7319 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7320 {
7321 return mode == SImode || mode == DImode
7322 || mode == SFmode || mode == DFmode
7323 || (aarch64_vector_mode_supported_p (mode)
7324 && (known_eq (GET_MODE_SIZE (mode), 8)
7325 || (known_eq (GET_MODE_SIZE (mode), 16)
7326 && (aarch64_tune_params.extra_tuning_flags
7327 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7328 }
7329
7330 /* Return true if REGNO is a virtual pointer register, or an eliminable
7331 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7332 include stack_pointer or hard_frame_pointer. */
7333 static bool
7334 virt_or_elim_regno_p (unsigned regno)
7335 {
7336 return ((regno >= FIRST_VIRTUAL_REGISTER
7337 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7338 || regno == FRAME_POINTER_REGNUM
7339 || regno == ARG_POINTER_REGNUM);
7340 }
7341
7342 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7343 If it is, fill in INFO appropriately. STRICT_P is true if
7344 REG_OK_STRICT is in effect. */
7345
7346 bool
7347 aarch64_classify_address (struct aarch64_address_info *info,
7348 rtx x, machine_mode mode, bool strict_p,
7349 aarch64_addr_query_type type)
7350 {
7351 enum rtx_code code = GET_CODE (x);
7352 rtx op0, op1;
7353 poly_int64 offset;
7354
7355 HOST_WIDE_INT const_size;
7356
7357 /* On BE, we use load/store pair for all large int mode load/stores.
7358 TI/TFmode may also use a load/store pair. */
7359 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7360 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7361 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7362 || type == ADDR_QUERY_LDP_STP_N
7363 || mode == TImode
7364 || mode == TFmode
7365 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7366
7367 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7368 to the actual size of the memory being loaded/stored and the mode used to
7369 check the addressing is half of that.
7370 if (type == ADDR_QUERY_LDP_STP_N
7371 && known_eq (GET_MODE_SIZE (mode), 16))
7372 mode = DFmode;
7373
7374 bool allow_reg_index_p = (!load_store_pair_p
7375 && (known_lt (GET_MODE_SIZE (mode), 16)
7376 || vec_flags == VEC_ADVSIMD
7377 || vec_flags & VEC_SVE_DATA));
7378
7379 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7380 [Rn, #offset, MUL VL]. */
7381 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7382 && (code != REG && code != PLUS))
7383 return false;
7384
7385 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7386 REG addressing. */
7387 if (advsimd_struct_p
7388 && !BYTES_BIG_ENDIAN
7389 && (code != POST_INC && code != REG))
7390 return false;
7391
7392 gcc_checking_assert (GET_MODE (x) == VOIDmode
7393 || SCALAR_INT_MODE_P (GET_MODE (x)));
7394
7395 switch (code)
7396 {
7397 case REG:
7398 case SUBREG:
7399 info->type = ADDRESS_REG_IMM;
7400 info->base = x;
7401 info->offset = const0_rtx;
7402 info->const_offset = 0;
7403 return aarch64_base_register_rtx_p (x, strict_p);
7404
7405 case PLUS:
7406 op0 = XEXP (x, 0);
7407 op1 = XEXP (x, 1);
7408
7409 if (! strict_p
7410 && REG_P (op0)
7411 && virt_or_elim_regno_p (REGNO (op0))
7412 && poly_int_rtx_p (op1, &offset))
7413 {
7414 info->type = ADDRESS_REG_IMM;
7415 info->base = op0;
7416 info->offset = op1;
7417 info->const_offset = offset;
7418
7419 return true;
7420 }
7421
7422 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7423 && aarch64_base_register_rtx_p (op0, strict_p)
7424 && poly_int_rtx_p (op1, &offset))
7425 {
7426 info->type = ADDRESS_REG_IMM;
7427 info->base = op0;
7428 info->offset = op1;
7429 info->const_offset = offset;
7430
7431 /* TImode and TFmode values are allowed in both pairs of X
7432 registers and individual Q registers. The available
7433 address modes are:
7434 X,X: 7-bit signed scaled offset
7435 Q: 9-bit signed offset
7436 We conservatively require an offset representable in either mode.
7437 When performing the check for pairs of X registers i.e. LDP/STP
7438 pass down DImode since that is the natural size of the LDP/STP
7439 instruction memory accesses. */
7440 if (mode == TImode || mode == TFmode)
7441 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7442 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7443 || offset_12bit_unsigned_scaled_p (mode, offset)));
7444
7445 /* A 7-bit offset check because OImode will emit an ldp/stp
7446 instruction (only big endian will get here).
7447 For ldp/stp instructions, the offset is scaled for the size of a
7448 single element of the pair. */
7449 if (mode == OImode)
7450 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7451
7452 /* Three 9/12-bit offset checks because CImode will emit three
7453 ldr/str instructions (only big endian will get here). */
7454 if (mode == CImode)
7455 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7456 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7457 offset + 32)
7458 || offset_12bit_unsigned_scaled_p (V16QImode,
7459 offset + 32)));
7460
7461 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7462 instructions (only big endian will get here). */
7463 if (mode == XImode)
7464 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7465 && aarch64_offset_7bit_signed_scaled_p (TImode,
7466 offset + 32));
7467
7468 /* Make "m" use the LD1 offset range for SVE data modes, so
7469 that pre-RTL optimizers like ivopts will work to that
7470 instead of the wider LDR/STR range. */
7471 if (vec_flags == VEC_SVE_DATA)
7472 return (type == ADDR_QUERY_M
7473 ? offset_4bit_signed_scaled_p (mode, offset)
7474 : offset_9bit_signed_scaled_p (mode, offset));
7475
7476 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7477 {
7478 poly_int64 end_offset = (offset
7479 + GET_MODE_SIZE (mode)
7480 - BYTES_PER_SVE_VECTOR);
7481 return (type == ADDR_QUERY_M
7482 ? offset_4bit_signed_scaled_p (mode, offset)
7483 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7484 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7485 end_offset)));
7486 }
7487
7488 if (vec_flags == VEC_SVE_PRED)
7489 return offset_9bit_signed_scaled_p (mode, offset);
7490
7491 if (load_store_pair_p)
7492 return ((known_eq (GET_MODE_SIZE (mode), 4)
7493 || known_eq (GET_MODE_SIZE (mode), 8)
7494 || known_eq (GET_MODE_SIZE (mode), 16))
7495 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7496 else
7497 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7498 || offset_12bit_unsigned_scaled_p (mode, offset));
7499 }
7500
7501 if (allow_reg_index_p)
7502 {
7503 /* Look for base + (scaled/extended) index register. */
7504 if (aarch64_base_register_rtx_p (op0, strict_p)
7505 && aarch64_classify_index (info, op1, mode, strict_p))
7506 {
7507 info->base = op0;
7508 return true;
7509 }
7510 if (aarch64_base_register_rtx_p (op1, strict_p)
7511 && aarch64_classify_index (info, op0, mode, strict_p))
7512 {
7513 info->base = op1;
7514 return true;
7515 }
7516 }
7517
7518 return false;
7519
7520 case POST_INC:
7521 case POST_DEC:
7522 case PRE_INC:
7523 case PRE_DEC:
7524 info->type = ADDRESS_REG_WB;
7525 info->base = XEXP (x, 0);
7526 info->offset = NULL_RTX;
7527 return aarch64_base_register_rtx_p (info->base, strict_p);
7528
7529 case POST_MODIFY:
7530 case PRE_MODIFY:
7531 info->type = ADDRESS_REG_WB;
7532 info->base = XEXP (x, 0);
7533 if (GET_CODE (XEXP (x, 1)) == PLUS
7534 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7535 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7536 && aarch64_base_register_rtx_p (info->base, strict_p))
7537 {
7538 info->offset = XEXP (XEXP (x, 1), 1);
7539 info->const_offset = offset;
7540
7541 /* TImode and TFmode values are allowed in both pairs of X
7542 registers and individual Q registers. The available
7543 address modes are:
7544 X,X: 7-bit signed scaled offset
7545 Q: 9-bit signed offset
7546 We conservatively require an offset representable in either mode.
7547 */
7548 if (mode == TImode || mode == TFmode)
7549 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7550 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7551
7552 if (load_store_pair_p)
7553 return ((known_eq (GET_MODE_SIZE (mode), 4)
7554 || known_eq (GET_MODE_SIZE (mode), 8)
7555 || known_eq (GET_MODE_SIZE (mode), 16))
7556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7557 else
7558 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7559 }
7560 return false;
7561
7562 case CONST:
7563 case SYMBOL_REF:
7564 case LABEL_REF:
7565 /* load literal: pc-relative constant pool entry. Only supported
7566 for SI mode or larger. */
7567 info->type = ADDRESS_SYMBOLIC;
7568
7569 if (!load_store_pair_p
7570 && GET_MODE_SIZE (mode).is_constant (&const_size)
7571 && const_size >= 4)
7572 {
7573 rtx sym, addend;
7574
7575 split_const (x, &sym, &addend);
7576 return ((GET_CODE (sym) == LABEL_REF
7577 || (GET_CODE (sym) == SYMBOL_REF
7578 && CONSTANT_POOL_ADDRESS_P (sym)
7579 && aarch64_pcrelative_literal_loads)));
7580 }
7581 return false;
7582
7583 case LO_SUM:
7584 info->type = ADDRESS_LO_SUM;
7585 info->base = XEXP (x, 0);
7586 info->offset = XEXP (x, 1);
7587 if (allow_reg_index_p
7588 && aarch64_base_register_rtx_p (info->base, strict_p))
7589 {
7590 rtx sym, offs;
7591 split_const (info->offset, &sym, &offs);
7592 if (GET_CODE (sym) == SYMBOL_REF
7593 && (aarch64_classify_symbol (sym, INTVAL (offs))
7594 == SYMBOL_SMALL_ABSOLUTE))
7595 {
7596 /* The symbol and offset must be aligned to the access size. */
7597 unsigned int align;
7598
7599 if (CONSTANT_POOL_ADDRESS_P (sym))
7600 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7601 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7602 {
7603 tree exp = SYMBOL_REF_DECL (sym);
7604 align = TYPE_ALIGN (TREE_TYPE (exp));
7605 align = aarch64_constant_alignment (exp, align);
7606 }
7607 else if (SYMBOL_REF_DECL (sym))
7608 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7609 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7610 && SYMBOL_REF_BLOCK (sym) != NULL)
7611 align = SYMBOL_REF_BLOCK (sym)->alignment;
7612 else
7613 align = BITS_PER_UNIT;
7614
7615 poly_int64 ref_size = GET_MODE_SIZE (mode);
7616 if (known_eq (ref_size, 0))
7617 ref_size = GET_MODE_SIZE (DImode);
7618
7619 return (multiple_p (INTVAL (offs), ref_size)
7620 && multiple_p (align / BITS_PER_UNIT, ref_size));
7621 }
7622 }
7623 return false;
7624
7625 default:
7626 return false;
7627 }
7628 }
7629
7630 /* Return true if the address X is valid for a PRFM instruction.
7631 STRICT_P is true if we should do strict checking with
7632 aarch64_classify_address. */
7633
7634 bool
7635 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7636 {
7637 struct aarch64_address_info addr;
7638
7639 /* PRFM accepts the same addresses as DImode... */
7640 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7641 if (!res)
7642 return false;
7643
7644 /* ... except writeback forms. */
7645 return addr.type != ADDRESS_REG_WB;
7646 }
7647
7648 bool
7649 aarch64_symbolic_address_p (rtx x)
7650 {
7651 rtx offset;
7652
7653 split_const (x, &x, &offset);
7654 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7655 }
7656
7657 /* Classify the base of symbolic expression X. */
7658
7659 enum aarch64_symbol_type
7660 aarch64_classify_symbolic_expression (rtx x)
7661 {
7662 rtx offset;
7663
7664 split_const (x, &x, &offset);
7665 return aarch64_classify_symbol (x, INTVAL (offset));
7666 }
7667
7668
7669 /* Return TRUE if X is a legitimate address for accessing memory in
7670 mode MODE. */
7671 static bool
7672 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7673 {
7674 struct aarch64_address_info addr;
7675
7676 return aarch64_classify_address (&addr, x, mode, strict_p);
7677 }
7678
7679 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7680 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7681 bool
7682 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7683 aarch64_addr_query_type type)
7684 {
7685 struct aarch64_address_info addr;
7686
7687 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7688 }
7689
7690 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7691
7692 static bool
7693 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7694 poly_int64 orig_offset,
7695 machine_mode mode)
7696 {
7697 HOST_WIDE_INT size;
7698 if (GET_MODE_SIZE (mode).is_constant (&size))
7699 {
7700 HOST_WIDE_INT const_offset, second_offset;
7701
7702 /* A general SVE offset is A * VQ + B. Remove the A component from
7703 coefficient 0 in order to get the constant B. */
7704 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7705
7706 /* Split an out-of-range address displacement into a base and
7707 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7708 range otherwise to increase opportunities for sharing the base
7709 address of different sizes. Unaligned accesses use the signed
7710 9-bit range, TImode/TFmode use the intersection of signed
7711 scaled 7-bit and signed 9-bit offset. */
7712 if (mode == TImode || mode == TFmode)
7713 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7714 else if ((const_offset & (size - 1)) != 0)
7715 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7716 else
7717 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7718
7719 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7720 return false;
7721
7722 /* Split the offset into second_offset and the rest. */
7723 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7724 *offset2 = gen_int_mode (second_offset, Pmode);
7725 return true;
7726 }
7727 else
7728 {
7729 /* Get the mode we should use as the basis of the range. For structure
7730 modes this is the mode of one vector. */
7731 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7732 machine_mode step_mode
7733 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7734
7735 /* Get the "mul vl" multiplier we'd like to use. */
7736 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7737 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7738 if (vec_flags & VEC_SVE_DATA)
7739 /* LDR supports a 9-bit range, but the move patterns for
7740 structure modes require all vectors to be in range of the
7741 same base. The simplest way of accommodating that while still
7742 promoting reuse of anchor points between different modes is
7743 to use an 8-bit range unconditionally. */
7744 vnum = ((vnum + 128) & 255) - 128;
7745 else
7746 /* Predicates are only handled singly, so we might as well use
7747 the full range. */
7748 vnum = ((vnum + 256) & 511) - 256;
7749 if (vnum == 0)
7750 return false;
7751
7752 /* Convert the "mul vl" multiplier into a byte offset. */
7753 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7754 if (known_eq (second_offset, orig_offset))
7755 return false;
7756
7757 /* Split the offset into second_offset and the rest. */
7758 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7759 *offset2 = gen_int_mode (second_offset, Pmode);
7760 return true;
7761 }
7762 }
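
/* Illustrative sketch, not part of GCC: the constant-size splitting
   rules above, reduced to plain integers.  SIZE is the access size in
   bytes and SIZE == 16 stands in for the TImode/TFmode case; the
   helper name and the simplified interface are assumptions.  */
static void
example_split_offset (long long offset, int size,
                      long long *anchor, long long *residual)
{
  long long second;
  if (size == 16)
    /* Intersection of the scaled 7-bit and signed 9-bit ranges.  */
    second = ((offset + 0x100) & 0x1f8) - 0x100;
  else if ((offset & (size - 1)) != 0)
    /* Unaligned accesses use the signed 9-bit unscaled range.  */
    second = ((offset + 0x100) & 0x1ff) - 0x100;
  else
    /* Aligned accesses use a 4KB range for 1- and 2-byte accesses
       and a 16KB range otherwise.  */
    second = offset & (size < 4 ? 0xfff : 0x3ffc);
  *residual = second;
  *anchor = offset - second;
  /* e.g. offset == 0x12344 with size == 4 gives *residual == 0x2344
     and *anchor == 0x10000.  */
}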
7763
7764 /* Return the binary representation of floating point constant VALUE in INTVAL.
7765 If the value cannot be converted, return false without setting INTVAL.
7766 The conversion is done in the mode of VALUE. */
7767 bool
7768 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7769 {
7770
7771 /* We make a general exception for 0. */
7772 if (aarch64_float_const_zero_rtx_p (value))
7773 {
7774 *intval = 0;
7775 return true;
7776 }
7777
7778 scalar_float_mode mode;
7779 if (GET_CODE (value) != CONST_DOUBLE
7780 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7781 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7782 /* Only support up to DF mode. */
7783 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7784 return false;
7785
7786 unsigned HOST_WIDE_INT ival = 0;
7787
7788 long res[2];
7789 real_to_target (res,
7790 CONST_DOUBLE_REAL_VALUE (value),
7791 REAL_MODE_FORMAT (mode));
7792
7793 if (mode == DFmode)
7794 {
7795 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7796 ival = zext_hwi (res[order], 32);
7797 ival |= (zext_hwi (res[1 - order], 32) << 32);
7798 }
7799 else
7800 ival = zext_hwi (res[0], 32);
7801
7802 *intval = ival;
7803 return true;
7804 }
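
/* Illustrative sketch, not part of GCC: what the function above
   produces for a DFmode constant, written as a plain host-side type
   pun.  The real code goes through real_to_target so that it also
   handles HFmode/SFmode and hosts whose native double need not match
   the target format; the helper name is an assumption.  */
static unsigned long long
example_double_bits (double d)
{
  union { double f; unsigned long long u; } pun;
  pun.f = d;           /* reinterpret the bits, not the value */
  return pun.u;
  /* e.g. example_double_bits (1.0) == 0x3ff0000000000000 on an
     IEEE-754 host.  */
}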
7805
7806 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7807 single MOV(+MOVK) followed by an FMOV. */
7808 bool
7809 aarch64_float_const_rtx_p (rtx x)
7810 {
7811 machine_mode mode = GET_MODE (x);
7812 if (mode == VOIDmode)
7813 return false;
7814
7815 /* Determine whether it's cheaper to write float constants as
7816 mov/movk pairs over ldr/adrp pairs. */
7817 unsigned HOST_WIDE_INT ival;
7818
7819 if (GET_CODE (x) == CONST_DOUBLE
7820 && SCALAR_FLOAT_MODE_P (mode)
7821 && aarch64_reinterpret_float_as_int (x, &ival))
7822 {
7823 scalar_int_mode imode = (mode == HFmode
7824 ? SImode
7825 : int_mode_for_mode (mode).require ());
7826 int num_instr = aarch64_internal_mov_immediate
7827 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7828 return num_instr < 3;
7829 }
7830
7831 return false;
7832 }
7833
7834 /* Return TRUE if rtx X is the immediate constant 0.0. */
7835 bool
7836 aarch64_float_const_zero_rtx_p (rtx x)
7837 {
7838 if (GET_MODE (x) == VOIDmode)
7839 return false;
7840
7841 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7842 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7843 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7844 }
7845
7846 /* Return TRUE if rtx X is an immediate constant that fits in a single
7847 MOVI immediate operation. */
7848 bool
7849 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7850 {
7851 if (!TARGET_SIMD)
7852 return false;
7853
7854 machine_mode vmode;
7855 scalar_int_mode imode;
7856 unsigned HOST_WIDE_INT ival;
7857
7858 if (GET_CODE (x) == CONST_DOUBLE
7859 && SCALAR_FLOAT_MODE_P (mode))
7860 {
7861 if (!aarch64_reinterpret_float_as_int (x, &ival))
7862 return false;
7863
7864 /* We make a general exception for 0. */
7865 if (aarch64_float_const_zero_rtx_p (x))
7866 return true;
7867
7868 imode = int_mode_for_mode (mode).require ();
7869 }
7870 else if (GET_CODE (x) == CONST_INT
7871 && is_a <scalar_int_mode> (mode, &imode))
7872 ival = INTVAL (x);
7873 else
7874 return false;
7875
7876 /* Use a 64-bit container mode for everything except DImode/DFmode, where we
7877 use a 128-bit vector mode. */
7878 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7879
7880 vmode = aarch64_simd_container_mode (imode, width);
7881 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7882
7883 return aarch64_simd_valid_immediate (v_op, NULL);
7884 }
7885
7886
7887 /* Return the fixed registers used for condition codes. */
7888
7889 static bool
7890 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7891 {
7892 *p1 = CC_REGNUM;
7893 *p2 = INVALID_REGNUM;
7894 return true;
7895 }
7896
7897 /* This function is used by the call expanders of the machine description.
7898 RESULT is the register in which the result is returned. It's NULL for
7899 "call" and "sibcall".
7900 MEM is the location of the function call.
7901 SIBCALL indicates whether this function call is a normal call or a sibling call.
7902 It will generate a different pattern accordingly. */
7903
7904 void
7905 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7906 {
7907 rtx call, callee, tmp;
7908 rtvec vec;
7909 machine_mode mode;
7910
7911 gcc_assert (MEM_P (mem));
7912 callee = XEXP (mem, 0);
7913 mode = GET_MODE (callee);
7914 gcc_assert (mode == Pmode);
7915
7916 /* Decide if we should generate indirect calls by loading the
7917 address of the callee into a register before performing
7918 the branch-and-link. */
7919 if (SYMBOL_REF_P (callee)
7920 ? (aarch64_is_long_call_p (callee)
7921 || aarch64_is_noplt_call_p (callee))
7922 : !REG_P (callee))
7923 XEXP (mem, 0) = force_reg (mode, callee);
7924
7925 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7926
7927 if (result != NULL_RTX)
7928 call = gen_rtx_SET (result, call);
7929
7930 if (sibcall)
7931 tmp = ret_rtx;
7932 else
7933 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7934
7935 vec = gen_rtvec (2, call, tmp);
7936 call = gen_rtx_PARALLEL (VOIDmode, vec);
7937
7938 aarch64_emit_call_insn (call);
7939 }
7940
7941 /* Emit call insn with PAT and do aarch64-specific handling. */
7942
7943 void
7944 aarch64_emit_call_insn (rtx pat)
7945 {
7946 rtx insn = emit_call_insn (pat);
7947
7948 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7949 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7950 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7951 }
7952
7953 machine_mode
7954 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7955 {
7956 machine_mode mode_x = GET_MODE (x);
7957 rtx_code code_x = GET_CODE (x);
7958
7959 /* All floating point compares return CCFP if it is an equality
7960 comparison, and CCFPE otherwise. */
7961 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7962 {
7963 switch (code)
7964 {
7965 case EQ:
7966 case NE:
7967 case UNORDERED:
7968 case ORDERED:
7969 case UNLT:
7970 case UNLE:
7971 case UNGT:
7972 case UNGE:
7973 case UNEQ:
7974 return CCFPmode;
7975
7976 case LT:
7977 case LE:
7978 case GT:
7979 case GE:
7980 case LTGT:
7981 return CCFPEmode;
7982
7983 default:
7984 gcc_unreachable ();
7985 }
7986 }
7987
7988 /* Equality comparisons of short modes against zero can be performed
7989 using the TST instruction with the appropriate bitmask. */
7990 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7991 && (code == EQ || code == NE)
7992 && (mode_x == HImode || mode_x == QImode))
7993 return CC_NZmode;
7994
7995 /* Similarly, comparisons of zero_extends from shorter modes can
7996 be performed using an ANDS with an immediate mask. */
7997 if (y == const0_rtx && code_x == ZERO_EXTEND
7998 && (mode_x == SImode || mode_x == DImode)
7999 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8000 && (code == EQ || code == NE))
8001 return CC_NZmode;
8002
8003 if ((mode_x == SImode || mode_x == DImode)
8004 && y == const0_rtx
8005 && (code == EQ || code == NE || code == LT || code == GE)
8006 && (code_x == PLUS || code_x == MINUS || code_x == AND
8007 || code_x == NEG
8008 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8009 && CONST_INT_P (XEXP (x, 2)))))
8010 return CC_NZmode;
8011
8012 /* A compare with a shifted operand. Because of canonicalization,
8013 the comparison will have to be swapped when we emit the assembly
8014 code. */
8015 if ((mode_x == SImode || mode_x == DImode)
8016 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8017 && (code_x == ASHIFT || code_x == ASHIFTRT
8018 || code_x == LSHIFTRT
8019 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8020 return CC_SWPmode;
8021
8022 /* Similarly for a negated operand, but we can only do this for
8023 equalities. */
8024 if ((mode_x == SImode || mode_x == DImode)
8025 && (REG_P (y) || GET_CODE (y) == SUBREG)
8026 && (code == EQ || code == NE)
8027 && code_x == NEG)
8028 return CC_Zmode;
8029
8030 /* A test for unsigned overflow from an addition. */
8031 if ((mode_x == DImode || mode_x == TImode)
8032 && (code == LTU || code == GEU)
8033 && code_x == PLUS
8034 && rtx_equal_p (XEXP (x, 0), y))
8035 return CC_Cmode;
8036
8037 /* A test for unsigned overflow from an add with carry. */
8038 if ((mode_x == DImode || mode_x == TImode)
8039 && (code == LTU || code == GEU)
8040 && code_x == PLUS
8041 && CONST_SCALAR_INT_P (y)
8042 && (rtx_mode_t (y, mode_x)
8043 == (wi::shwi (1, mode_x)
8044 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8045 return CC_ADCmode;
8046
8047 /* A test for signed overflow. */
8048 if ((mode_x == DImode || mode_x == TImode)
8049 && code == NE
8050 && code_x == PLUS
8051 && GET_CODE (y) == SIGN_EXTEND)
8052 return CC_Vmode;
8053
8054 /* For everything else, return CCmode. */
8055 return CCmode;
8056 }
8057
8058 static int
8059 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8060
8061 int
8062 aarch64_get_condition_code (rtx x)
8063 {
8064 machine_mode mode = GET_MODE (XEXP (x, 0));
8065 enum rtx_code comp_code = GET_CODE (x);
8066
8067 if (GET_MODE_CLASS (mode) != MODE_CC)
8068 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8069 return aarch64_get_condition_code_1 (mode, comp_code);
8070 }
8071
8072 static int
8073 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8074 {
8075 switch (mode)
8076 {
8077 case E_CCFPmode:
8078 case E_CCFPEmode:
8079 switch (comp_code)
8080 {
8081 case GE: return AARCH64_GE;
8082 case GT: return AARCH64_GT;
8083 case LE: return AARCH64_LS;
8084 case LT: return AARCH64_MI;
8085 case NE: return AARCH64_NE;
8086 case EQ: return AARCH64_EQ;
8087 case ORDERED: return AARCH64_VC;
8088 case UNORDERED: return AARCH64_VS;
8089 case UNLT: return AARCH64_LT;
8090 case UNLE: return AARCH64_LE;
8091 case UNGT: return AARCH64_HI;
8092 case UNGE: return AARCH64_PL;
8093 default: return -1;
8094 }
8095 break;
8096
8097 case E_CCmode:
8098 switch (comp_code)
8099 {
8100 case NE: return AARCH64_NE;
8101 case EQ: return AARCH64_EQ;
8102 case GE: return AARCH64_GE;
8103 case GT: return AARCH64_GT;
8104 case LE: return AARCH64_LE;
8105 case LT: return AARCH64_LT;
8106 case GEU: return AARCH64_CS;
8107 case GTU: return AARCH64_HI;
8108 case LEU: return AARCH64_LS;
8109 case LTU: return AARCH64_CC;
8110 default: return -1;
8111 }
8112 break;
8113
8114 case E_CC_SWPmode:
8115 switch (comp_code)
8116 {
8117 case NE: return AARCH64_NE;
8118 case EQ: return AARCH64_EQ;
8119 case GE: return AARCH64_LE;
8120 case GT: return AARCH64_LT;
8121 case LE: return AARCH64_GE;
8122 case LT: return AARCH64_GT;
8123 case GEU: return AARCH64_LS;
8124 case GTU: return AARCH64_CC;
8125 case LEU: return AARCH64_CS;
8126 case LTU: return AARCH64_HI;
8127 default: return -1;
8128 }
8129 break;
8130
8131 case E_CC_NZCmode:
8132 switch (comp_code)
8133 {
8134 case NE: return AARCH64_NE; /* = any */
8135 case EQ: return AARCH64_EQ; /* = none */
8136 case GE: return AARCH64_PL; /* = nfrst */
8137 case LT: return AARCH64_MI; /* = first */
8138 case GEU: return AARCH64_CS; /* = nlast */
8139 case GTU: return AARCH64_HI; /* = pmore */
8140 case LEU: return AARCH64_LS; /* = plast */
8141 case LTU: return AARCH64_CC; /* = last */
8142 default: return -1;
8143 }
8144 break;
8145
8146 case E_CC_NZmode:
8147 switch (comp_code)
8148 {
8149 case NE: return AARCH64_NE;
8150 case EQ: return AARCH64_EQ;
8151 case GE: return AARCH64_PL;
8152 case LT: return AARCH64_MI;
8153 default: return -1;
8154 }
8155 break;
8156
8157 case E_CC_Zmode:
8158 switch (comp_code)
8159 {
8160 case NE: return AARCH64_NE;
8161 case EQ: return AARCH64_EQ;
8162 default: return -1;
8163 }
8164 break;
8165
8166 case E_CC_Cmode:
8167 switch (comp_code)
8168 {
8169 case LTU: return AARCH64_CS;
8170 case GEU: return AARCH64_CC;
8171 default: return -1;
8172 }
8173 break;
8174
8175 case E_CC_ADCmode:
8176 switch (comp_code)
8177 {
8178 case GEU: return AARCH64_CS;
8179 case LTU: return AARCH64_CC;
8180 default: return -1;
8181 }
8182 break;
8183
8184 case E_CC_Vmode:
8185 switch (comp_code)
8186 {
8187 case NE: return AARCH64_VS;
8188 case EQ: return AARCH64_VC;
8189 default: return -1;
8190 }
8191 break;
8192
8193 default:
8194 return -1;
8195 }
8196
8197 return -1;
8198 }
8199
8200 bool
8201 aarch64_const_vec_all_same_in_range_p (rtx x,
8202 HOST_WIDE_INT minval,
8203 HOST_WIDE_INT maxval)
8204 {
8205 rtx elt;
8206 return (const_vec_duplicate_p (x, &elt)
8207 && CONST_INT_P (elt)
8208 && IN_RANGE (INTVAL (elt), minval, maxval));
8209 }
8210
8211 bool
8212 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8213 {
8214 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8215 }
8216
8217 /* Return true if VEC is a constant in which every element is in the range
8218 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8219
8220 static bool
8221 aarch64_const_vec_all_in_range_p (rtx vec,
8222 HOST_WIDE_INT minval,
8223 HOST_WIDE_INT maxval)
8224 {
8225 if (GET_CODE (vec) != CONST_VECTOR
8226 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8227 return false;
8228
8229 int nunits;
8230 if (!CONST_VECTOR_STEPPED_P (vec))
8231 nunits = const_vector_encoded_nelts (vec);
8232 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8233 return false;
8234
8235 for (int i = 0; i < nunits; i++)
8236 {
8237 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8238 if (!CONST_INT_P (vec_elem)
8239 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8240 return false;
8241 }
8242 return true;
8243 }
8244
8245 /* N Z C V. */
8246 #define AARCH64_CC_V 1
8247 #define AARCH64_CC_C (1 << 1)
8248 #define AARCH64_CC_Z (1 << 2)
8249 #define AARCH64_CC_N (1 << 3)
8250
8251 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8252 static const int aarch64_nzcv_codes[] =
8253 {
8254 0, /* EQ, Z == 1. */
8255 AARCH64_CC_Z, /* NE, Z == 0. */
8256 0, /* CS, C == 1. */
8257 AARCH64_CC_C, /* CC, C == 0. */
8258 0, /* MI, N == 1. */
8259 AARCH64_CC_N, /* PL, N == 0. */
8260 0, /* VS, V == 1. */
8261 AARCH64_CC_V, /* VC, V == 0. */
8262 0, /* HI, C == 1 && Z == 0. */
8263 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8264 AARCH64_CC_V, /* GE, N == V. */
8265 0, /* LT, N != V. */
8266 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8267 0, /* LE, !(Z == 0 && N == V). */
8268 0, /* AL, Any. */
8269 0 /* NV, Any. */
8270 };
8271
8272 /* Print floating-point vector immediate operand X to F, negating it
8273 first if NEGATE is true. Return true on success, false if it isn't
8274 a constant we can handle. */
8275
8276 static bool
8277 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8278 {
8279 rtx elt;
8280
8281 if (!const_vec_duplicate_p (x, &elt))
8282 return false;
8283
8284 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8285 if (negate)
8286 r = real_value_negate (&r);
8287
8288 /* Handle the SVE single-bit immediates specially, since they have a
8289 fixed form in the assembly syntax. */
8290 if (real_equal (&r, &dconst0))
8291 asm_fprintf (f, "0.0");
8292 else if (real_equal (&r, &dconst2))
8293 asm_fprintf (f, "2.0");
8294 else if (real_equal (&r, &dconst1))
8295 asm_fprintf (f, "1.0");
8296 else if (real_equal (&r, &dconsthalf))
8297 asm_fprintf (f, "0.5");
8298 else
8299 {
8300 const int buf_size = 20;
8301 char float_buf[buf_size] = {'\0'};
8302 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8303 1, GET_MODE (elt));
8304 asm_fprintf (f, "%s", float_buf);
8305 }
8306
8307 return true;
8308 }
8309
8310 /* Return the equivalent letter for size. */
8311 static char
8312 sizetochar (int size)
8313 {
8314 switch (size)
8315 {
8316 case 64: return 'd';
8317 case 32: return 's';
8318 case 16: return 'h';
8319 case 8 : return 'b';
8320 default: gcc_unreachable ();
8321 }
8322 }
8323
8324 /* Print operand X to file F in a target specific manner according to CODE.
8325 The acceptable formatting commands given by CODE are:
8326 'c': An integer or symbol address without a preceding #
8327 sign.
8328 'C': Take the duplicated element in a vector constant
8329 and print it in hex.
8330 'D': Take the duplicated element in a vector constant
8331 and print it as an unsigned integer, in decimal.
8332 'e': Print the sign/zero-extend size as a character 8->b,
8333 16->h, 32->w. Can also be used for masks:
8334 0xff->b, 0xffff->h, 0xffffffff->w.
8335 'I': If the operand is a duplicated vector constant,
8336 replace it with the duplicated scalar. If the
8337 operand is then a floating-point constant, replace
8338 it with the integer bit representation. Print the
8339 transformed constant as a signed decimal number.
8340 'p': Prints N such that 2^N == X (X must be a power of 2 and
8341 a const int).
8342 'P': Print the number of non-zero bits in X (a const_int).
8343 'H': Print the higher numbered register of a pair (TImode)
8344 of regs.
8345 'm': Print a condition (eq, ne, etc).
8346 'M': Same as 'm', but invert condition.
8347 'N': Take the duplicated element in a vector constant
8348 and print the negative of it in decimal.
8349 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8350 'S/T/U/V': Print a FP/SIMD register name for a register list.
8351 The register printed is the FP/SIMD register name
8352 of X + 0/1/2/3 for S/T/U/V.
8353 'R': Print a scalar FP/SIMD register name + 1.
8354 'X': Print bottom 16 bits of integer constant in hex.
8355 'w/x': Print a general register name or the zero register
8356 (32-bit or 64-bit).
8357 '0': Print a normal operand; if it's a general register,
8358 we assume DImode.
8359 'k': Print NZCV for conditional compare instructions.
8360 'A': Output address constant representing the first
8361 argument of X, specifying a relocation offset
8362 if appropriate.
8363 'L': Output constant address specified by X
8364 with a relocation offset if appropriate.
8365 'G': Prints address of X, specifying a PC relative
8366 relocation mode if appropriate.
8367 'y': Output address of LDP or STP - this is used for
8368 some LDP/STPs which don't use a PARALLEL in their
8369 pattern (so the mode needs to be adjusted).
8370 'z': Output address of a typical LDP or STP. */
8371
8372 static void
8373 aarch64_print_operand (FILE *f, rtx x, int code)
8374 {
8375 rtx elt;
8376 switch (code)
8377 {
8378 case 'c':
8379 switch (GET_CODE (x))
8380 {
8381 case CONST_INT:
8382 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8383 break;
8384
8385 case SYMBOL_REF:
8386 output_addr_const (f, x);
8387 break;
8388
8389 case CONST:
8390 if (GET_CODE (XEXP (x, 0)) == PLUS
8391 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8392 {
8393 output_addr_const (f, x);
8394 break;
8395 }
8396 /* Fall through. */
8397
8398 default:
8399 output_operand_lossage ("unsupported operand for code '%c'", code);
8400 }
8401 break;
8402
8403 case 'e':
8404 {
8405 x = unwrap_const_vec_duplicate (x);
8406 if (!CONST_INT_P (x))
8407 {
8408 output_operand_lossage ("invalid operand for '%%%c'", code);
8409 return;
8410 }
8411
8412 HOST_WIDE_INT val = INTVAL (x);
8413 if ((val & ~7) == 8 || val == 0xff)
8414 fputc ('b', f);
8415 else if ((val & ~7) == 16 || val == 0xffff)
8416 fputc ('h', f);
8417 else if ((val & ~7) == 32 || val == 0xffffffff)
8418 fputc ('w', f);
8419 else
8420 {
8421 output_operand_lossage ("invalid operand for '%%%c'", code);
8422 return;
8423 }
8424 }
8425 break;
8426
8427 case 'p':
8428 {
8429 int n;
8430
8431 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8432 {
8433 output_operand_lossage ("invalid operand for '%%%c'", code);
8434 return;
8435 }
8436
8437 asm_fprintf (f, "%d", n);
8438 }
8439 break;
8440
8441 case 'P':
8442 if (!CONST_INT_P (x))
8443 {
8444 output_operand_lossage ("invalid operand for '%%%c'", code);
8445 return;
8446 }
8447
8448 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8449 break;
8450
8451 case 'H':
8452 if (x == const0_rtx)
8453 {
8454 asm_fprintf (f, "xzr");
8455 break;
8456 }
8457
8458 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8459 {
8460 output_operand_lossage ("invalid operand for '%%%c'", code);
8461 return;
8462 }
8463
8464 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8465 break;
8466
8467 case 'I':
8468 {
8469 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8470 if (CONST_INT_P (x))
8471 asm_fprintf (f, "%wd", INTVAL (x));
8472 else
8473 {
8474 output_operand_lossage ("invalid operand for '%%%c'", code);
8475 return;
8476 }
8477 break;
8478 }
8479
8480 case 'M':
8481 case 'm':
8482 {
8483 int cond_code;
8484 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8485 if (x == const_true_rtx)
8486 {
8487 if (code == 'M')
8488 fputs ("nv", f);
8489 return;
8490 }
8491
8492 if (!COMPARISON_P (x))
8493 {
8494 output_operand_lossage ("invalid operand for '%%%c'", code);
8495 return;
8496 }
8497
8498 cond_code = aarch64_get_condition_code (x);
8499 gcc_assert (cond_code >= 0);
8500 if (code == 'M')
8501 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8502 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8503 fputs (aarch64_sve_condition_codes[cond_code], f);
8504 else
8505 fputs (aarch64_condition_codes[cond_code], f);
8506 }
8507 break;
8508
8509 case 'N':
8510 if (!const_vec_duplicate_p (x, &elt))
8511 {
8512 output_operand_lossage ("invalid vector constant");
8513 return;
8514 }
8515
8516 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8517 asm_fprintf (f, "%wd", -INTVAL (elt));
8518 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8519 && aarch64_print_vector_float_operand (f, x, true))
8520 ;
8521 else
8522 {
8523 output_operand_lossage ("invalid vector constant");
8524 return;
8525 }
8526 break;
8527
8528 case 'b':
8529 case 'h':
8530 case 's':
8531 case 'd':
8532 case 'q':
8533 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8534 {
8535 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8536 return;
8537 }
8538 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8539 break;
8540
8541 case 'S':
8542 case 'T':
8543 case 'U':
8544 case 'V':
8545 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8546 {
8547 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8548 return;
8549 }
8550 asm_fprintf (f, "%c%d",
8551 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8552 REGNO (x) - V0_REGNUM + (code - 'S'));
8553 break;
8554
8555 case 'R':
8556 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8557 {
8558 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8559 return;
8560 }
8561 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8562 break;
8563
8564 case 'X':
8565 if (!CONST_INT_P (x))
8566 {
8567 output_operand_lossage ("invalid operand for '%%%c'", code);
8568 return;
8569 }
8570 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8571 break;
8572
8573 case 'C':
8574 {
8575 /* Print a replicated constant in hex. */
8576 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8577 {
8578 output_operand_lossage ("invalid operand for '%%%c'", code);
8579 return;
8580 }
8581 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8582 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8583 }
8584 break;
8585
8586 case 'D':
8587 {
8588 /* Print a replicated constant in decimal, treating it as
8589 unsigned. */
8590 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8591 {
8592 output_operand_lossage ("invalid operand for '%%%c'", code);
8593 return;
8594 }
8595 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8596 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8597 }
8598 break;
8599
8600 case 'w':
8601 case 'x':
8602 if (x == const0_rtx
8603 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8604 {
8605 asm_fprintf (f, "%czr", code);
8606 break;
8607 }
8608
8609 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8610 {
8611 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8612 break;
8613 }
8614
8615 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8616 {
8617 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8618 break;
8619 }
8620
8621 /* Fall through */
8622
8623 case 0:
8624 if (x == NULL)
8625 {
8626 output_operand_lossage ("missing operand");
8627 return;
8628 }
8629
8630 switch (GET_CODE (x))
8631 {
8632 case REG:
8633 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8634 {
8635 if (REG_NREGS (x) == 1)
8636 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8637 else
8638 {
8639 char suffix
8640 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8641 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8642 REGNO (x) - V0_REGNUM, suffix,
8643 END_REGNO (x) - V0_REGNUM - 1, suffix);
8644 }
8645 }
8646 else
8647 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8648 break;
8649
8650 case MEM:
8651 output_address (GET_MODE (x), XEXP (x, 0));
8652 break;
8653
8654 case LABEL_REF:
8655 case SYMBOL_REF:
8656 output_addr_const (asm_out_file, x);
8657 break;
8658
8659 case CONST_INT:
8660 asm_fprintf (f, "%wd", INTVAL (x));
8661 break;
8662
8663 case CONST:
8664 if (!VECTOR_MODE_P (GET_MODE (x)))
8665 {
8666 output_addr_const (asm_out_file, x);
8667 break;
8668 }
8669 /* fall through */
8670
8671 case CONST_VECTOR:
8672 if (!const_vec_duplicate_p (x, &elt))
8673 {
8674 output_operand_lossage ("invalid vector constant");
8675 return;
8676 }
8677
8678 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8679 asm_fprintf (f, "%wd", INTVAL (elt));
8680 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8681 && aarch64_print_vector_float_operand (f, x, false))
8682 ;
8683 else
8684 {
8685 output_operand_lossage ("invalid vector constant");
8686 return;
8687 }
8688 break;
8689
8690 case CONST_DOUBLE:
8691 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8692 be getting CONST_DOUBLEs holding integers. */
8693 gcc_assert (GET_MODE (x) != VOIDmode);
8694 if (aarch64_float_const_zero_rtx_p (x))
8695 {
8696 fputc ('0', f);
8697 break;
8698 }
8699 else if (aarch64_float_const_representable_p (x))
8700 {
8701 #define buf_size 20
8702 char float_buf[buf_size] = {'\0'};
8703 real_to_decimal_for_mode (float_buf,
8704 CONST_DOUBLE_REAL_VALUE (x),
8705 buf_size, buf_size,
8706 1, GET_MODE (x));
8707 asm_fprintf (asm_out_file, "%s", float_buf);
8708 break;
8709 #undef buf_size
8710 }
8711 output_operand_lossage ("invalid constant");
8712 return;
8713 default:
8714 output_operand_lossage ("invalid operand");
8715 return;
8716 }
8717 break;
8718
8719 case 'A':
8720 if (GET_CODE (x) == HIGH)
8721 x = XEXP (x, 0);
8722
8723 switch (aarch64_classify_symbolic_expression (x))
8724 {
8725 case SYMBOL_SMALL_GOT_4G:
8726 asm_fprintf (asm_out_file, ":got:");
8727 break;
8728
8729 case SYMBOL_SMALL_TLSGD:
8730 asm_fprintf (asm_out_file, ":tlsgd:");
8731 break;
8732
8733 case SYMBOL_SMALL_TLSDESC:
8734 asm_fprintf (asm_out_file, ":tlsdesc:");
8735 break;
8736
8737 case SYMBOL_SMALL_TLSIE:
8738 asm_fprintf (asm_out_file, ":gottprel:");
8739 break;
8740
8741 case SYMBOL_TLSLE24:
8742 asm_fprintf (asm_out_file, ":tprel:");
8743 break;
8744
8745 case SYMBOL_TINY_GOT:
8746 gcc_unreachable ();
8747 break;
8748
8749 default:
8750 break;
8751 }
8752 output_addr_const (asm_out_file, x);
8753 break;
8754
8755 case 'L':
8756 switch (aarch64_classify_symbolic_expression (x))
8757 {
8758 case SYMBOL_SMALL_GOT_4G:
8759 asm_fprintf (asm_out_file, ":lo12:");
8760 break;
8761
8762 case SYMBOL_SMALL_TLSGD:
8763 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8764 break;
8765
8766 case SYMBOL_SMALL_TLSDESC:
8767 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8768 break;
8769
8770 case SYMBOL_SMALL_TLSIE:
8771 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8772 break;
8773
8774 case SYMBOL_TLSLE12:
8775 asm_fprintf (asm_out_file, ":tprel_lo12:");
8776 break;
8777
8778 case SYMBOL_TLSLE24:
8779 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8780 break;
8781
8782 case SYMBOL_TINY_GOT:
8783 asm_fprintf (asm_out_file, ":got:");
8784 break;
8785
8786 case SYMBOL_TINY_TLSIE:
8787 asm_fprintf (asm_out_file, ":gottprel:");
8788 break;
8789
8790 default:
8791 break;
8792 }
8793 output_addr_const (asm_out_file, x);
8794 break;
8795
8796 case 'G':
8797 switch (aarch64_classify_symbolic_expression (x))
8798 {
8799 case SYMBOL_TLSLE24:
8800 asm_fprintf (asm_out_file, ":tprel_hi12:");
8801 break;
8802 default:
8803 break;
8804 }
8805 output_addr_const (asm_out_file, x);
8806 break;
8807
8808 case 'k':
8809 {
8810 HOST_WIDE_INT cond_code;
8811
8812 if (!CONST_INT_P (x))
8813 {
8814 output_operand_lossage ("invalid operand for '%%%c'", code);
8815 return;
8816 }
8817
8818 cond_code = INTVAL (x);
8819 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8820 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8821 }
8822 break;
8823
8824 case 'y':
8825 case 'z':
8826 {
8827 machine_mode mode = GET_MODE (x);
8828
8829 if (GET_CODE (x) != MEM
8830 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8831 {
8832 output_operand_lossage ("invalid operand for '%%%c'", code);
8833 return;
8834 }
8835
8836 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8837 code == 'y'
8838 ? ADDR_QUERY_LDP_STP_N
8839 : ADDR_QUERY_LDP_STP))
8840 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8841 }
8842 break;
8843
8844 default:
8845 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8846 return;
8847 }
8848 }
8849
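/* For illustration (an informal summary of the cases above, not
exhaustive): '%w' and '%x' print a general register as w<n>/x<n>, or
wzr/xzr for zero and wsp/sp for the stack pointer; '%b', '%h', '%s',
'%d' and '%q' print an FP/SIMD register with the given size prefix;
'%C' and '%D' print a replicated vector constant in hex and in
unsigned decimal; '%k' prints the NZCV immediate used by
conditional-compare instructions. */
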
8850 /* Print address 'x' of a memory access with mode 'mode'.
8851 'type' is the aarch64_addr_query_type context required by
8852 aarch64_classify_address, e.g. a normal access or an LDP/STP address. */
8853 static bool
8854 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8855 aarch64_addr_query_type type)
8856 {
8857 struct aarch64_address_info addr;
8858 unsigned int size;
8859
8860 /* Check all addresses are Pmode - including ILP32. */
8861 if (GET_MODE (x) != Pmode
8862 && (!CONST_INT_P (x)
8863 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8864 {
8865 output_operand_lossage ("invalid address mode");
8866 return false;
8867 }
8868
8869 if (aarch64_classify_address (&addr, x, mode, true, type))
8870 switch (addr.type)
8871 {
8872 case ADDRESS_REG_IMM:
8873 if (known_eq (addr.const_offset, 0))
8874 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8875 else if (aarch64_sve_data_mode_p (mode))
8876 {
8877 HOST_WIDE_INT vnum
8878 = exact_div (addr.const_offset,
8879 BYTES_PER_SVE_VECTOR).to_constant ();
8880 asm_fprintf (f, "[%s, #%wd, mul vl]",
8881 reg_names[REGNO (addr.base)], vnum);
8882 }
8883 else if (aarch64_sve_pred_mode_p (mode))
8884 {
8885 HOST_WIDE_INT vnum
8886 = exact_div (addr.const_offset,
8887 BYTES_PER_SVE_PRED).to_constant ();
8888 asm_fprintf (f, "[%s, #%wd, mul vl]",
8889 reg_names[REGNO (addr.base)], vnum);
8890 }
8891 else
8892 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8893 INTVAL (addr.offset));
8894 return true;
8895
8896 case ADDRESS_REG_REG:
8897 if (addr.shift == 0)
8898 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8899 reg_names [REGNO (addr.offset)]);
8900 else
8901 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8902 reg_names [REGNO (addr.offset)], addr.shift);
8903 return true;
8904
8905 case ADDRESS_REG_UXTW:
8906 if (addr.shift == 0)
8907 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8908 REGNO (addr.offset) - R0_REGNUM);
8909 else
8910 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8911 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8912 return true;
8913
8914 case ADDRESS_REG_SXTW:
8915 if (addr.shift == 0)
8916 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8917 REGNO (addr.offset) - R0_REGNUM);
8918 else
8919 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8920 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8921 return true;
8922
8923 case ADDRESS_REG_WB:
8924 /* Writeback is only supported for fixed-width modes. */
8925 size = GET_MODE_SIZE (mode).to_constant ();
8926 switch (GET_CODE (x))
8927 {
8928 case PRE_INC:
8929 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8930 return true;
8931 case POST_INC:
8932 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8933 return true;
8934 case PRE_DEC:
8935 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8936 return true;
8937 case POST_DEC:
8938 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8939 return true;
8940 case PRE_MODIFY:
8941 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8942 INTVAL (addr.offset));
8943 return true;
8944 case POST_MODIFY:
8945 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8946 INTVAL (addr.offset));
8947 return true;
8948 default:
8949 break;
8950 }
8951 break;
8952
8953 case ADDRESS_LO_SUM:
8954 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8955 output_addr_const (f, addr.offset);
8956 asm_fprintf (f, "]");
8957 return true;
8958
8959 case ADDRESS_SYMBOLIC:
8960 output_addr_const (f, x);
8961 return true;
8962 }
8963
8964 return false;
8965 }
8966
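/* For illustration, the cases above emit address operands such as:
[x0] (base only), [x0, 16] (base plus immediate), [x0, #2, mul vl]
(SVE offset in vector-length units), [x0, x1, lsl 3] (scaled index),
[x0, w1, sxtw 2] (sign-extended 32-bit index), [x0, 16]! and [x0], 16
(pre/post writeback) and [x0, #:lo12:sym] (LO_SUM, typically paired
with an ADRP). The register numbers and offsets are only examples. */
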
8967 /* Print address 'x' of a memory access with mode 'mode'. */
8968 static void
8969 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8970 {
8971 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8972 output_addr_const (f, x);
8973 }
8974
8975 bool
8976 aarch64_label_mentioned_p (rtx x)
8977 {
8978 const char *fmt;
8979 int i;
8980
8981 if (GET_CODE (x) == LABEL_REF)
8982 return true;
8983
8984 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8985 referencing instruction, but they are constant offsets, not
8986 symbols. */
8987 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8988 return false;
8989
8990 fmt = GET_RTX_FORMAT (GET_CODE (x));
8991 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8992 {
8993 if (fmt[i] == 'E')
8994 {
8995 int j;
8996
8997 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8998 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8999 return 1;
9000 }
9001 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9002 return 1;
9003 }
9004
9005 return 0;
9006 }
9007
9008 /* Implement REGNO_REG_CLASS. */
9009
9010 enum reg_class
9011 aarch64_regno_regclass (unsigned regno)
9012 {
9013 if (GP_REGNUM_P (regno))
9014 return GENERAL_REGS;
9015
9016 if (regno == SP_REGNUM)
9017 return STACK_REG;
9018
9019 if (regno == FRAME_POINTER_REGNUM
9020 || regno == ARG_POINTER_REGNUM)
9021 return POINTER_REGS;
9022
9023 if (FP_REGNUM_P (regno))
9024 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9025 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9026
9027 if (PR_REGNUM_P (regno))
9028 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9029
9030 return NO_REGS;
9031 }
9032
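/* For illustration, assuming the usual AArch64 register numbering:
x0-x30 return GENERAL_REGS, sp returns STACK_REG, v0-v7 return
FP_LO8_REGS, v8-v15 FP_LO_REGS, v16-v31 FP_REGS, and the SVE
predicate registers p0-p7 and p8-p15 return PR_LO_REGS and
PR_HI_REGS respectively. */
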
9033 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9034 If OFFSET is out of range, return an offset of an anchor point
9035 that is in range. Return 0 otherwise. */
9036
9037 static HOST_WIDE_INT
9038 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9039 machine_mode mode)
9040 {
9041 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9042 if (size > 16)
9043 return (offset + 0x400) & ~0x7f0;
9044
9045 /* For offsets that aren't a multiple of the access size, the limit is
9046 -256...255. */
9047 if (offset & (size - 1))
9048 {
9049 /* BLKmode typically uses LDP of X-registers. */
9050 if (mode == BLKmode)
9051 return (offset + 512) & ~0x3ff;
9052 return (offset + 0x100) & ~0x1ff;
9053 }
9054
9055 /* Small negative offsets are supported. */
9056 if (IN_RANGE (offset, -256, 0))
9057 return 0;
9058
9059 if (mode == TImode || mode == TFmode)
9060 return (offset + 0x100) & ~0x1ff;
9061
9062 /* Use a 12-bit offset scaled by the access size. */
9063 return offset & (~0xfff * size);
9064 }
9065
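/* For illustration (plain hex arithmetic, not GCC code):
aarch64_anchor_offset (0x9010, 8, DImode) returns 0x9010 & ~0x7fff
== 0x8000, leaving an in-range scaled offset of 0x1010 from the
anchor; aarch64_anchor_offset (-128, 4, SImode) returns 0, since
small negative offsets are supported directly; and an unaligned
offset such as 0x123 with a 4-byte access gives
(0x123 + 0x100) & ~0x1ff == 0x200. */
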
9066 static rtx
9067 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9068 {
9069 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9070 where mask is selected by alignment and size of the offset.
9071 We try to pick as large a range for the offset as possible to
9072 maximize the chance of a CSE. However, for aligned addresses
9073 we limit the range to 4k so that structures with different sized
9074 elements are likely to use the same base. We need to be careful
9075 not to split a CONST for some forms of address expression, otherwise
9076 it will generate sub-optimal code. */
9077
9078 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9079 {
9080 rtx base = XEXP (x, 0);
9081 rtx offset_rtx = XEXP (x, 1);
9082 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9083
9084 if (GET_CODE (base) == PLUS)
9085 {
9086 rtx op0 = XEXP (base, 0);
9087 rtx op1 = XEXP (base, 1);
9088
9089 /* Force any scaling into a temp for CSE. */
9090 op0 = force_reg (Pmode, op0);
9091 op1 = force_reg (Pmode, op1);
9092
9093 /* Let the pointer register be in op0. */
9094 if (REG_POINTER (op1))
9095 std::swap (op0, op1);
9096
9097 /* If the pointer is virtual or frame related, then we know that
9098 virtual register instantiation or register elimination is going
9099 to apply a second constant. We want the two constants folded
9100 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9101 if (virt_or_elim_regno_p (REGNO (op0)))
9102 {
9103 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9104 NULL_RTX, true, OPTAB_DIRECT);
9105 return gen_rtx_PLUS (Pmode, base, op1);
9106 }
9107
9108 /* Otherwise, in order to encourage CSE (and thence loop strength
9109 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9110 base = expand_binop (Pmode, add_optab, op0, op1,
9111 NULL_RTX, true, OPTAB_DIRECT);
9112 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9113 }
9114
9115 HOST_WIDE_INT size;
9116 if (GET_MODE_SIZE (mode).is_constant (&size))
9117 {
9118 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9119 mode);
9120 if (base_offset != 0)
9121 {
9122 base = plus_constant (Pmode, base, base_offset);
9123 base = force_operand (base, NULL_RTX);
9124 return plus_constant (Pmode, base, offset - base_offset);
9125 }
9126 }
9127 }
9128
9129 return x;
9130 }
9131
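/* For illustration, continuing the example above: a DImode access to
X + 0x9010 is rewritten as TMP = X + 0x8000 followed by an address of
TMP + 0x1010, so that nearby accesses sharing the same anchor can CSE
the TMP computation. TMP is only a placeholder name here. */
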
9132 static reg_class_t
9133 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9134 reg_class_t rclass,
9135 machine_mode mode,
9136 secondary_reload_info *sri)
9137 {
9138 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9139 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9140 comment at the head of aarch64-sve.md for more details about the
9141 big-endian handling. */
9142 if (BYTES_BIG_ENDIAN
9143 && reg_class_subset_p (rclass, FP_REGS)
9144 && !((REG_P (x) && HARD_REGISTER_P (x))
9145 || aarch64_simd_valid_immediate (x, NULL))
9146 && aarch64_sve_data_mode_p (mode))
9147 {
9148 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9149 return NO_REGS;
9150 }
9151
9152 /* If we have to disable direct literal pool loads and stores because the
9153 function is too big, then we need a scratch register. */
9154 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9155 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9156 || targetm.vector_mode_supported_p (GET_MODE (x)))
9157 && !aarch64_pcrelative_literal_loads)
9158 {
9159 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9160 return NO_REGS;
9161 }
9162
9163 /* Without the TARGET_SIMD instructions we cannot move a Q register
9164 to a Q register directly. We need a scratch. */
9165 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9166 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9167 && reg_class_subset_p (rclass, FP_REGS))
9168 {
9169 sri->icode = code_for_aarch64_reload_mov (mode);
9170 return NO_REGS;
9171 }
9172
9173 /* A TFmode or TImode memory access should be handled via FP_REGS
9174 because AArch64 has richer addressing modes for LDR/STR instructions
9175 than for LDP/STP instructions. */
9176 if (TARGET_FLOAT && rclass == GENERAL_REGS
9177 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9178 return FP_REGS;
9179
9180 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
9181 return GENERAL_REGS;
9182
9183 return NO_REGS;
9184 }
9185
9186 static bool
9187 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9188 {
9189 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9190
9191 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9192 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9193 if (frame_pointer_needed)
9194 return to == HARD_FRAME_POINTER_REGNUM;
9195 return true;
9196 }
9197
9198 poly_int64
9199 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9200 {
9201 if (to == HARD_FRAME_POINTER_REGNUM)
9202 {
9203 if (from == ARG_POINTER_REGNUM)
9204 return cfun->machine->frame.hard_fp_offset;
9205
9206 if (from == FRAME_POINTER_REGNUM)
9207 return cfun->machine->frame.hard_fp_offset
9208 - cfun->machine->frame.locals_offset;
9209 }
9210
9211 if (to == STACK_POINTER_REGNUM)
9212 {
9213 if (from == FRAME_POINTER_REGNUM)
9214 return cfun->machine->frame.frame_size
9215 - cfun->machine->frame.locals_offset;
9216 }
9217
9218 return cfun->machine->frame.frame_size;
9219 }
9220
9221 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9222 previous frame. */
9223
9224 rtx
9225 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9226 {
9227 if (count != 0)
9228 return const0_rtx;
9229 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9230 }
9231
9232
9233 static void
9234 aarch64_asm_trampoline_template (FILE *f)
9235 {
9236 int offset1 = 16;
9237 int offset2 = 20;
9238
9239 if (aarch64_bti_enabled ())
9240 {
9241 asm_fprintf (f, "\thint\t34 // bti c\n");
9242 offset1 -= 4;
9243 offset2 -= 4;
9244 }
9245
9246 if (TARGET_ILP32)
9247 {
9248 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9249 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9250 offset1);
9251 }
9252 else
9253 {
9254 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9255 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9256 offset2);
9257 }
9258 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9259
9260 /* The trampoline needs an extra padding instruction. If BTI is
9261 enabled, the padding instruction is replaced by the BTI instruction at
9262 the beginning. */
9263 if (!aarch64_bti_enabled ())
9264 assemble_aligned_integer (4, const0_rtx);
9265
9266 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9267 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9268 }
9269
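/* For illustration, in the LP64, non-BTI case the template above
assembles to roughly:

ldr x17, .+16
ldr x18, .+20
br x17
<4-byte padding>
<8-byte function address, filled in by aarch64_trampoline_init>
<8-byte static chain value, filled in by aarch64_trampoline_init>

assuming IP1 is x17 and the static chain register is x18. */
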
9270 static void
9271 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9272 {
9273 rtx fnaddr, mem, a_tramp;
9274 const int tramp_code_sz = 16;
9275
9276 /* Don't need to copy the trailing D-words, we fill those in below. */
9277 emit_block_move (m_tramp, assemble_trampoline_template (),
9278 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9279 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9280 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9281 if (GET_MODE (fnaddr) != ptr_mode)
9282 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9283 emit_move_insn (mem, fnaddr);
9284
9285 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9286 emit_move_insn (mem, chain_value);
9287
9288 /* XXX We should really define a "clear_cache" pattern and use
9289 gen_clear_cache(). */
9290 a_tramp = XEXP (m_tramp, 0);
9291 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9292 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9293 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9294 ptr_mode);
9295 }
9296
9297 static unsigned char
9298 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9299 {
9300 /* ??? Logically we should only need to provide a value when
9301 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9302 can hold MODE, but at the moment we need to handle all modes.
9303 Just ignore any runtime parts for registers that can't store them. */
9304 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9305 unsigned int nregs;
9306 switch (regclass)
9307 {
9308 case TAILCALL_ADDR_REGS:
9309 case POINTER_REGS:
9310 case GENERAL_REGS:
9311 case ALL_REGS:
9312 case POINTER_AND_FP_REGS:
9313 case FP_REGS:
9314 case FP_LO_REGS:
9315 case FP_LO8_REGS:
9316 if (aarch64_sve_data_mode_p (mode)
9317 && constant_multiple_p (GET_MODE_SIZE (mode),
9318 BYTES_PER_SVE_VECTOR, &nregs))
9319 return nregs;
9320 return (aarch64_vector_data_mode_p (mode)
9321 ? CEIL (lowest_size, UNITS_PER_VREG)
9322 : CEIL (lowest_size, UNITS_PER_WORD));
9323 case STACK_REG:
9324 case PR_REGS:
9325 case PR_LO_REGS:
9326 case PR_HI_REGS:
9327 return 1;
9328
9329 case NO_REGS:
9330 return 0;
9331
9332 default:
9333 break;
9334 }
9335 gcc_unreachable ();
9336 }
9337
9338 static reg_class_t
9339 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9340 {
9341 if (regclass == POINTER_REGS)
9342 return GENERAL_REGS;
9343
9344 if (regclass == STACK_REG)
9345 {
9346 if (REG_P (x)
9347 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9348 return regclass;
9349
9350 return NO_REGS;
9351 }
9352
9353 /* Register elimination can result in a request for
9354 SP+constant->FP_REGS. We cannot support such operations, which
9355 use SP as source and an FP_REG as destination, so reject them
9356 outright here. */
9357 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9358 {
9359 rtx lhs = XEXP (x, 0);
9360
9361 /* Look through a possible SUBREG introduced by ILP32. */
9362 if (GET_CODE (lhs) == SUBREG)
9363 lhs = SUBREG_REG (lhs);
9364
9365 gcc_assert (REG_P (lhs));
9366 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9367 POINTER_REGS));
9368 return NO_REGS;
9369 }
9370
9371 return regclass;
9372 }
9373
9374 void
9375 aarch64_asm_output_labelref (FILE* f, const char *name)
9376 {
9377 asm_fprintf (f, "%U%s", name);
9378 }
9379
9380 static void
9381 aarch64_elf_asm_constructor (rtx symbol, int priority)
9382 {
9383 if (priority == DEFAULT_INIT_PRIORITY)
9384 default_ctor_section_asm_out_constructor (symbol, priority);
9385 else
9386 {
9387 section *s;
9388 /* Although priority is known to be in the range [0, 65535], and so
9389 18 bytes would be enough, the compiler might not know that. To avoid
9390 a -Wformat-truncation false positive, use a larger size. */
9391 char buf[23];
9392 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9393 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9394 switch_to_section (s);
9395 assemble_align (POINTER_SIZE);
9396 assemble_aligned_integer (POINTER_BYTES, symbol);
9397 }
9398 }
9399
9400 static void
9401 aarch64_elf_asm_destructor (rtx symbol, int priority)
9402 {
9403 if (priority == DEFAULT_INIT_PRIORITY)
9404 default_dtor_section_asm_out_destructor (symbol, priority);
9405 else
9406 {
9407 section *s;
9408 /* Although priority is known to be in the range [0, 65535], and so
9409 18 bytes would be enough, the compiler might not know that. To avoid
9410 a -Wformat-truncation false positive, use a larger size. */
9411 char buf[23];
9412 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9413 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9414 switch_to_section (s);
9415 assemble_align (POINTER_SIZE);
9416 assemble_aligned_integer (POINTER_BYTES, symbol);
9417 }
9418 }
9419
9420 const char*
9421 aarch64_output_casesi (rtx *operands)
9422 {
9423 char buf[100];
9424 char label[100];
9425 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9426 int index;
9427 static const char *const patterns[4][2] =
9428 {
9429 {
9430 "ldrb\t%w3, [%0,%w1,uxtw]",
9431 "add\t%3, %4, %w3, sxtb #2"
9432 },
9433 {
9434 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9435 "add\t%3, %4, %w3, sxth #2"
9436 },
9437 {
9438 "ldr\t%w3, [%0,%w1,uxtw #2]",
9439 "add\t%3, %4, %w3, sxtw #2"
9440 },
9441 /* We assume that DImode is only generated when not optimizing and
9442 that we don't really need 64-bit address offsets. That would
9443 imply an object file with 8GB of code in a single function! */
9444 {
9445 "ldr\t%w3, [%0,%w1,uxtw #2]",
9446 "add\t%3, %4, %w3, sxtw #2"
9447 }
9448 };
9449
9450 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9451
9452 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9453 index = exact_log2 (GET_MODE_SIZE (mode));
9454
9455 gcc_assert (index >= 0 && index <= 3);
9456
9457 /* Need to implement table size reduction, by changing the code below. */
9458 output_asm_insn (patterns[index][0], operands);
9459 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9460 snprintf (buf, sizeof (buf),
9461 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9462 output_asm_insn (buf, operands);
9463 output_asm_insn (patterns[index][1], operands);
9464 output_asm_insn ("br\t%3", operands);
9465 assemble_label (asm_out_file, label);
9466 return "";
9467 }
9468
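/* For illustration, with a halfword dispatch table (index 1 above) the
emitted sequence looks roughly like this, where the registers stand in
for operands 0, 1, 3 and 4 and <N> is the internal label number:

ldrh w3, [x0, w1, uxtw #1]
adr x4, .Lrtx<N>
add x3, x4, w3, sxth #2
br x3
.Lrtx<N>:

i.e. the table entries are label offsets relative to .Lrtx<N>, divided
by 4. */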
9469
9470 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9471 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9472 operator. */
9473
9474 int
9475 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9476 {
9477 if (shift >= 0 && shift <= 3)
9478 {
9479 int size;
9480 for (size = 8; size <= 32; size *= 2)
9481 {
9482 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9483 if (mask == bits << shift)
9484 return size;
9485 }
9486 }
9487 return 0;
9488 }
9489
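/* For illustration: aarch64_uxt_size (0, 0xff) == 8,
aarch64_uxt_size (2, 0x3fc) == 8 (0xff << 2) and
aarch64_uxt_size (1, 0x1fffe) == 16 (0xffff << 1); any mask that is
not a byte, halfword or word mask shifted left by 0..3 gives 0. */
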
9490 /* Constant pools are per-function only when PC-relative
9491 literal loads are enabled or we are using the large memory
9492 model. */
9493
9494 static inline bool
9495 aarch64_can_use_per_function_literal_pools_p (void)
9496 {
9497 return (aarch64_pcrelative_literal_loads
9498 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9499 }
9500
9501 static bool
9502 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9503 {
9504 /* We can't use blocks for constants when we're using a per-function
9505 constant pool. */
9506 return !aarch64_can_use_per_function_literal_pools_p ();
9507 }
9508
9509 /* Select appropriate section for constants depending
9510 on where we place literal pools. */
9511
9512 static section *
9513 aarch64_select_rtx_section (machine_mode mode,
9514 rtx x,
9515 unsigned HOST_WIDE_INT align)
9516 {
9517 if (aarch64_can_use_per_function_literal_pools_p ())
9518 return function_section (current_function_decl);
9519
9520 return default_elf_select_rtx_section (mode, x, align);
9521 }
9522
9523 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9524 void
9525 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9526 HOST_WIDE_INT offset)
9527 {
9528 /* When using per-function literal pools, we must ensure that any code
9529 section is aligned to the minimal instruction length, lest we get
9530 errors from the assembler re "unaligned instructions". */
9531 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9532 ASM_OUTPUT_ALIGN (f, 2);
9533 }
9534
9535 /* Costs. */
9536
9537 /* Helper function for rtx cost calculation. Strip a shift expression
9538 from X. Returns the inner operand if successful, or the original
9539 expression on failure. */
9540 static rtx
9541 aarch64_strip_shift (rtx x)
9542 {
9543 rtx op = x;
9544
9545 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9546 we can convert both to ROR during final output. */
9547 if ((GET_CODE (op) == ASHIFT
9548 || GET_CODE (op) == ASHIFTRT
9549 || GET_CODE (op) == LSHIFTRT
9550 || GET_CODE (op) == ROTATERT
9551 || GET_CODE (op) == ROTATE)
9552 && CONST_INT_P (XEXP (op, 1)))
9553 return XEXP (op, 0);
9554
9555 if (GET_CODE (op) == MULT
9556 && CONST_INT_P (XEXP (op, 1))
9557 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9558 return XEXP (op, 0);
9559
9560 return x;
9561 }
9562
9563 /* Helper function for rtx cost calculation. Strip an extend
9564 expression from X. Returns the inner operand if successful, or the
9565 original expression on failure. We deal with a number of possible
9566 canonicalization variations here. If STRIP_SHIFT is true, then
9567 we can strip off a shift also. */
9568 static rtx
9569 aarch64_strip_extend (rtx x, bool strip_shift)
9570 {
9571 scalar_int_mode mode;
9572 rtx op = x;
9573
9574 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9575 return op;
9576
9577 /* Zero and sign extraction of a widened value. */
9578 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9579 && XEXP (op, 2) == const0_rtx
9580 && GET_CODE (XEXP (op, 0)) == MULT
9581 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9582 XEXP (op, 1)))
9583 return XEXP (XEXP (op, 0), 0);
9584
9585 /* It can also be represented (for zero-extend) as an AND with an
9586 immediate. */
9587 if (GET_CODE (op) == AND
9588 && GET_CODE (XEXP (op, 0)) == MULT
9589 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9590 && CONST_INT_P (XEXP (op, 1))
9591 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9592 INTVAL (XEXP (op, 1))) != 0)
9593 return XEXP (XEXP (op, 0), 0);
9594
9595 /* Now handle extended register, as this may also have an optional
9596 left shift by 1..4. */
9597 if (strip_shift
9598 && GET_CODE (op) == ASHIFT
9599 && CONST_INT_P (XEXP (op, 1))
9600 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9601 op = XEXP (op, 0);
9602
9603 if (GET_CODE (op) == ZERO_EXTEND
9604 || GET_CODE (op) == SIGN_EXTEND)
9605 op = XEXP (op, 0);
9606
9607 if (op != x)
9608 return op;
9609
9610 return x;
9611 }
9612
9613 /* Return true iff CODE is a shift supported in combination
9614 with arithmetic instructions. */
9615
9616 static bool
9617 aarch64_shift_p (enum rtx_code code)
9618 {
9619 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9620 }
9621
9622
9623 /* Return true iff X is a cheap shift without a sign extend. */
9624
9625 static bool
9626 aarch64_cheap_mult_shift_p (rtx x)
9627 {
9628 rtx op0, op1;
9629
9630 op0 = XEXP (x, 0);
9631 op1 = XEXP (x, 1);
9632
9633 if (!(aarch64_tune_params.extra_tuning_flags
9634 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9635 return false;
9636
9637 if (GET_CODE (op0) == SIGN_EXTEND)
9638 return false;
9639
9640 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9641 && UINTVAL (op1) <= 4)
9642 return true;
9643
9644 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9645 return false;
9646
9647 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9648
9649 if (l2 > 0 && l2 <= 4)
9650 return true;
9651
9652 return false;
9653 }
9654
9655 /* Helper function for rtx cost calculation. Calculate the cost of
9656 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9657 Return the calculated cost of the expression, recursing manually in to
9658 operands where needed. */
9659
9660 static int
9661 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9662 {
9663 rtx op0, op1;
9664 const struct cpu_cost_table *extra_cost
9665 = aarch64_tune_params.insn_extra_cost;
9666 int cost = 0;
9667 bool compound_p = (outer == PLUS || outer == MINUS);
9668 machine_mode mode = GET_MODE (x);
9669
9670 gcc_checking_assert (code == MULT);
9671
9672 op0 = XEXP (x, 0);
9673 op1 = XEXP (x, 1);
9674
9675 if (VECTOR_MODE_P (mode))
9676 mode = GET_MODE_INNER (mode);
9677
9678 /* Integer multiply/fma. */
9679 if (GET_MODE_CLASS (mode) == MODE_INT)
9680 {
9681 /* The multiply will be canonicalized as a shift, cost it as such. */
9682 if (aarch64_shift_p (GET_CODE (x))
9683 || (CONST_INT_P (op1)
9684 && exact_log2 (INTVAL (op1)) > 0))
9685 {
9686 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9687 || GET_CODE (op0) == SIGN_EXTEND;
9688 if (speed)
9689 {
9690 if (compound_p)
9691 {
9692 /* If the shift is considered cheap,
9693 then don't add any cost. */
9694 if (aarch64_cheap_mult_shift_p (x))
9695 ;
9696 else if (REG_P (op1))
9697 /* ARITH + shift-by-register. */
9698 cost += extra_cost->alu.arith_shift_reg;
9699 else if (is_extend)
9700 /* ARITH + extended register. We don't have a cost field
9701 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9702 cost += extra_cost->alu.extend_arith;
9703 else
9704 /* ARITH + shift-by-immediate. */
9705 cost += extra_cost->alu.arith_shift;
9706 }
9707 else
9708 /* LSL (immediate). */
9709 cost += extra_cost->alu.shift;
9710
9711 }
9712 /* Strip extends as we will have costed them in the case above. */
9713 if (is_extend)
9714 op0 = aarch64_strip_extend (op0, true);
9715
9716 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9717
9718 return cost;
9719 }
9720
9721 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9722 compound and let the below cases handle it. After all, MNEG is a
9723 special-case alias of MSUB. */
9724 if (GET_CODE (op0) == NEG)
9725 {
9726 op0 = XEXP (op0, 0);
9727 compound_p = true;
9728 }
9729
9730 /* Integer multiplies or FMAs have zero/sign extending variants. */
9731 if ((GET_CODE (op0) == ZERO_EXTEND
9732 && GET_CODE (op1) == ZERO_EXTEND)
9733 || (GET_CODE (op0) == SIGN_EXTEND
9734 && GET_CODE (op1) == SIGN_EXTEND))
9735 {
9736 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9737 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9738
9739 if (speed)
9740 {
9741 if (compound_p)
9742 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9743 cost += extra_cost->mult[0].extend_add;
9744 else
9745 /* MUL/SMULL/UMULL. */
9746 cost += extra_cost->mult[0].extend;
9747 }
9748
9749 return cost;
9750 }
9751
9752 /* This is either an integer multiply or a MADD. In both cases
9753 we want to recurse and cost the operands. */
9754 cost += rtx_cost (op0, mode, MULT, 0, speed);
9755 cost += rtx_cost (op1, mode, MULT, 1, speed);
9756
9757 if (speed)
9758 {
9759 if (compound_p)
9760 /* MADD/MSUB. */
9761 cost += extra_cost->mult[mode == DImode].add;
9762 else
9763 /* MUL. */
9764 cost += extra_cost->mult[mode == DImode].simple;
9765 }
9766
9767 return cost;
9768 }
9769 else
9770 {
9771 if (speed)
9772 {
9773 /* Floating-point FMA/FMUL can also support negations of the
9774 operands, unless the rounding mode is upward or downward in
9775 which case FNMUL is different from FMUL with operand negation. */
9776 bool neg0 = GET_CODE (op0) == NEG;
9777 bool neg1 = GET_CODE (op1) == NEG;
9778 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9779 {
9780 if (neg0)
9781 op0 = XEXP (op0, 0);
9782 if (neg1)
9783 op1 = XEXP (op1, 0);
9784 }
9785
9786 if (compound_p)
9787 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9788 cost += extra_cost->fp[mode == DFmode].fma;
9789 else
9790 /* FMUL/FNMUL. */
9791 cost += extra_cost->fp[mode == DFmode].mult;
9792 }
9793
9794 cost += rtx_cost (op0, mode, MULT, 0, speed);
9795 cost += rtx_cost (op1, mode, MULT, 1, speed);
9796 return cost;
9797 }
9798 }
9799
9800 static int
9801 aarch64_address_cost (rtx x,
9802 machine_mode mode,
9803 addr_space_t as ATTRIBUTE_UNUSED,
9804 bool speed)
9805 {
9806 enum rtx_code c = GET_CODE (x);
9807 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9808 struct aarch64_address_info info;
9809 int cost = 0;
9810 info.shift = 0;
9811
9812 if (!aarch64_classify_address (&info, x, mode, false))
9813 {
9814 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9815 {
9816 /* This is a CONST or SYMBOL ref which will be split
9817 in a different way depending on the code model in use.
9818 Cost it through the generic infrastructure. */
9819 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9820 /* Divide through by the cost of one instruction to
9821 bring it to the same units as the address costs. */
9822 cost_symbol_ref /= COSTS_N_INSNS (1);
9823 /* The cost is then the cost of preparing the address,
9824 followed by an immediate (possibly 0) offset. */
9825 return cost_symbol_ref + addr_cost->imm_offset;
9826 }
9827 else
9828 {
9829 /* This is most likely a jump table from a case
9830 statement. */
9831 return addr_cost->register_offset;
9832 }
9833 }
9834
9835 switch (info.type)
9836 {
9837 case ADDRESS_LO_SUM:
9838 case ADDRESS_SYMBOLIC:
9839 case ADDRESS_REG_IMM:
9840 cost += addr_cost->imm_offset;
9841 break;
9842
9843 case ADDRESS_REG_WB:
9844 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9845 cost += addr_cost->pre_modify;
9846 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9847 cost += addr_cost->post_modify;
9848 else
9849 gcc_unreachable ();
9850
9851 break;
9852
9853 case ADDRESS_REG_REG:
9854 cost += addr_cost->register_offset;
9855 break;
9856
9857 case ADDRESS_REG_SXTW:
9858 cost += addr_cost->register_sextend;
9859 break;
9860
9861 case ADDRESS_REG_UXTW:
9862 cost += addr_cost->register_zextend;
9863 break;
9864
9865 default:
9866 gcc_unreachable ();
9867 }
9868
9869
9870 if (info.shift > 0)
9871 {
9872 /* For the sake of calculating the cost of the shifted register
9873 component, we can treat same sized modes in the same way. */
9874 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9875 cost += addr_cost->addr_scale_costs.hi;
9876 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9877 cost += addr_cost->addr_scale_costs.si;
9878 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9879 cost += addr_cost->addr_scale_costs.di;
9880 else
9881 /* We can't tell, or this is a 128-bit vector. */
9882 cost += addr_cost->addr_scale_costs.ti;
9883 }
9884
9885 return cost;
9886 }
9887
9888 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9889 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9890 to be taken. */
9891
9892 int
9893 aarch64_branch_cost (bool speed_p, bool predictable_p)
9894 {
9895 /* When optimizing for speed, use the cost of unpredictable branches. */
9896 const struct cpu_branch_cost *branch_costs =
9897 aarch64_tune_params.branch_costs;
9898
9899 if (!speed_p || predictable_p)
9900 return branch_costs->predictable;
9901 else
9902 return branch_costs->unpredictable;
9903 }
9904
9905 /* Return true if the RTX X in mode MODE is a zero or sign extract
9906 usable in an ADD or SUB (extended register) instruction. */
9907 static bool
9908 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9909 {
9910 /* Catch add with a sign extract.
9911 This is add_<optab><mode>_multp2. */
9912 if (GET_CODE (x) == SIGN_EXTRACT
9913 || GET_CODE (x) == ZERO_EXTRACT)
9914 {
9915 rtx op0 = XEXP (x, 0);
9916 rtx op1 = XEXP (x, 1);
9917 rtx op2 = XEXP (x, 2);
9918
9919 if (GET_CODE (op0) == MULT
9920 && CONST_INT_P (op1)
9921 && op2 == const0_rtx
9922 && CONST_INT_P (XEXP (op0, 1))
9923 && aarch64_is_extend_from_extract (mode,
9924 XEXP (op0, 1),
9925 op1))
9926 {
9927 return true;
9928 }
9929 }
9930 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9931 No shift. */
9932 else if (GET_CODE (x) == SIGN_EXTEND
9933 || GET_CODE (x) == ZERO_EXTEND)
9934 return REG_P (XEXP (x, 0));
9935
9936 return false;
9937 }
9938
9939 static bool
9940 aarch64_frint_unspec_p (unsigned int u)
9941 {
9942 switch (u)
9943 {
9944 case UNSPEC_FRINTZ:
9945 case UNSPEC_FRINTP:
9946 case UNSPEC_FRINTM:
9947 case UNSPEC_FRINTA:
9948 case UNSPEC_FRINTN:
9949 case UNSPEC_FRINTX:
9950 case UNSPEC_FRINTI:
9951 return true;
9952
9953 default:
9954 return false;
9955 }
9956 }
9957
9958 /* Return true iff X is an rtx that will match an extr instruction
9959 i.e. as described in the *extr<mode>5_insn family of patterns.
9960 OP0 and OP1 will be set to the operands of the shifts involved
9961 on success and will be NULL_RTX otherwise. */
9962
9963 static bool
9964 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9965 {
9966 rtx op0, op1;
9967 scalar_int_mode mode;
9968 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9969 return false;
9970
9971 *res_op0 = NULL_RTX;
9972 *res_op1 = NULL_RTX;
9973
9974 if (GET_CODE (x) != IOR)
9975 return false;
9976
9977 op0 = XEXP (x, 0);
9978 op1 = XEXP (x, 1);
9979
9980 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9981 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9982 {
9983 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9984 if (GET_CODE (op1) == ASHIFT)
9985 std::swap (op0, op1);
9986
9987 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9988 return false;
9989
9990 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9991 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9992
9993 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9994 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9995 {
9996 *res_op0 = XEXP (op0, 0);
9997 *res_op1 = XEXP (op1, 0);
9998 return true;
9999 }
10000 }
10001
10002 return false;
10003 }
10004
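/* For illustration, in DImode an IOR such as
(x << 48) | (y lshiftrt 16) satisfies the check above because
48 + 16 == 64; *res_op0 is set to x and *res_op1 to y, allowing a
single EXTR to replace the two shifts and the ORR. */
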
10005 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10006 storing it in *COST. Result is true if the total cost of the operation
10007 has now been calculated. */
10008 static bool
10009 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10010 {
10011 rtx inner;
10012 rtx comparator;
10013 enum rtx_code cmpcode;
10014
10015 if (COMPARISON_P (op0))
10016 {
10017 inner = XEXP (op0, 0);
10018 comparator = XEXP (op0, 1);
10019 cmpcode = GET_CODE (op0);
10020 }
10021 else
10022 {
10023 inner = op0;
10024 comparator = const0_rtx;
10025 cmpcode = NE;
10026 }
10027
10028 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10029 {
10030 /* Conditional branch. */
10031 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10032 return true;
10033 else
10034 {
10035 if (cmpcode == NE || cmpcode == EQ)
10036 {
10037 if (comparator == const0_rtx)
10038 {
10039 /* TBZ/TBNZ/CBZ/CBNZ. */
10040 if (GET_CODE (inner) == ZERO_EXTRACT)
10041 /* TBZ/TBNZ. */
10042 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10043 ZERO_EXTRACT, 0, speed);
10044 else
10045 /* CBZ/CBNZ. */
10046 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10047
10048 return true;
10049 }
10050 }
10051 else if (cmpcode == LT || cmpcode == GE)
10052 {
10053 /* TBZ/TBNZ. */
10054 if (comparator == const0_rtx)
10055 return true;
10056 }
10057 }
10058 }
10059 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10060 {
10061 /* CCMP. */
10062 if (GET_CODE (op1) == COMPARE)
10063 {
10064 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10065 if (XEXP (op1, 1) == const0_rtx)
10066 *cost += 1;
10067 if (speed)
10068 {
10069 machine_mode mode = GET_MODE (XEXP (op1, 0));
10070 const struct cpu_cost_table *extra_cost
10071 = aarch64_tune_params.insn_extra_cost;
10072
10073 if (GET_MODE_CLASS (mode) == MODE_INT)
10074 *cost += extra_cost->alu.arith;
10075 else
10076 *cost += extra_cost->fp[mode == DFmode].compare;
10077 }
10078 return true;
10079 }
10080
10081 /* It's a conditional operation based on the status flags,
10082 so it must be some flavor of CSEL. */
10083
10084 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10085 if (GET_CODE (op1) == NEG
10086 || GET_CODE (op1) == NOT
10087 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10088 op1 = XEXP (op1, 0);
10089 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10090 {
10091 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10092 op1 = XEXP (op1, 0);
10093 op2 = XEXP (op2, 0);
10094 }
10095
10096 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10097 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10098 return true;
10099 }
10100
10101 /* We don't know what this is, cost all operands. */
10102 return false;
10103 }
10104
10105 /* Check whether X is a bitfield operation of the form shift + extend that
10106 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10107 operand to which the bitfield operation is applied. Otherwise return
10108 NULL_RTX. */
10109
10110 static rtx
10111 aarch64_extend_bitfield_pattern_p (rtx x)
10112 {
10113 rtx_code outer_code = GET_CODE (x);
10114 machine_mode outer_mode = GET_MODE (x);
10115
10116 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10117 && outer_mode != SImode && outer_mode != DImode)
10118 return NULL_RTX;
10119
10120 rtx inner = XEXP (x, 0);
10121 rtx_code inner_code = GET_CODE (inner);
10122 machine_mode inner_mode = GET_MODE (inner);
10123 rtx op = NULL_RTX;
10124
10125 switch (inner_code)
10126 {
10127 case ASHIFT:
10128 if (CONST_INT_P (XEXP (inner, 1))
10129 && (inner_mode == QImode || inner_mode == HImode))
10130 op = XEXP (inner, 0);
10131 break;
10132 case LSHIFTRT:
10133 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10134 && (inner_mode == QImode || inner_mode == HImode))
10135 op = XEXP (inner, 0);
10136 break;
10137 case ASHIFTRT:
10138 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10139 && (inner_mode == QImode || inner_mode == HImode))
10140 op = XEXP (inner, 0);
10141 break;
10142 default:
10143 break;
10144 }
10145
10146 return op;
10147 }
10148
10149 /* Return true if the mask and a shift amount from an RTX of the form
10150 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10151 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10152
10153 bool
10154 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10155 rtx shft_amnt)
10156 {
10157 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10158 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10159 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10160 && (INTVAL (mask)
10161 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10162 }
10163
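/* For illustration, in SImode a mask of 0x3fc with a shift amount of 2
passes the checks above: (0x3fc >> 2) + 1 == 0x100 is a power of two
and no set bit of the mask lies below the shift amount, so
(x << 2) & 0x3fc can be implemented as a single UBFIZ. */
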
10164 /* Return true if the masks and a shift amount from an RTX of the form
10165 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10166 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
10167
10168 bool
10169 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10170 unsigned HOST_WIDE_INT mask1,
10171 unsigned HOST_WIDE_INT shft_amnt,
10172 unsigned HOST_WIDE_INT mask2)
10173 {
10174 unsigned HOST_WIDE_INT t;
10175
10176 /* Verify that there is no overlap in what bits are set in the two masks. */
10177 if (mask1 != ~mask2)
10178 return false;
10179
10180 /* Verify that mask2 is not all zeros or ones. */
10181 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10182 return false;
10183
10184 /* The shift amount should always be less than the mode size. */
10185 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10186
10187 /* Verify that the mask being shifted is contiguous and would be in the
10188 least significant bits after shifting by shft_amnt. */
10189 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10190 return (t == (t & -t));
10191 }
10192
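/* For illustration, in DImode mask1 == 0xffffffff000000ff,
shft_amnt == 8 and mask2 == 0x00000000ffffff00 pass the checks above:
mask1 == ~mask2, mask2 is neither 0 nor all ones, and
mask2 + (1 << 8) == 0x100000000 is a power of two, i.e. mask2 is a
contiguous bit-field starting at bit 8, as required for BFI. */
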
10193 /* Calculate the cost of calculating X, storing it in *COST. Result
10194 is true if the total cost of the operation has now been calculated. */
10195 static bool
10196 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10197 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10198 {
10199 rtx op0, op1, op2;
10200 const struct cpu_cost_table *extra_cost
10201 = aarch64_tune_params.insn_extra_cost;
10202 int code = GET_CODE (x);
10203 scalar_int_mode int_mode;
10204
10205 /* By default, assume that everything has equivalent cost to the
10206 cheapest instruction. Any additional costs are applied as a delta
10207 above this default. */
10208 *cost = COSTS_N_INSNS (1);
10209
10210 switch (code)
10211 {
10212 case SET:
10213 /* The cost depends entirely on the operands to SET. */
10214 *cost = 0;
10215 op0 = SET_DEST (x);
10216 op1 = SET_SRC (x);
10217
10218 switch (GET_CODE (op0))
10219 {
10220 case MEM:
10221 if (speed)
10222 {
10223 rtx address = XEXP (op0, 0);
10224 if (VECTOR_MODE_P (mode))
10225 *cost += extra_cost->ldst.storev;
10226 else if (GET_MODE_CLASS (mode) == MODE_INT)
10227 *cost += extra_cost->ldst.store;
10228 else if (mode == SFmode)
10229 *cost += extra_cost->ldst.storef;
10230 else if (mode == DFmode)
10231 *cost += extra_cost->ldst.stored;
10232
10233 *cost +=
10234 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10235 0, speed));
10236 }
10237
10238 *cost += rtx_cost (op1, mode, SET, 1, speed);
10239 return true;
10240
10241 case SUBREG:
10242 if (! REG_P (SUBREG_REG (op0)))
10243 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10244
10245 /* Fall through. */
10246 case REG:
10247 /* The cost is one per vector-register copied. */
10248 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10249 {
10250 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10251 *cost = COSTS_N_INSNS (nregs);
10252 }
10253 /* const0_rtx is in general free, but we will use an
10254 instruction to set a register to 0. */
10255 else if (REG_P (op1) || op1 == const0_rtx)
10256 {
10257 /* The cost is 1 per register copied. */
10258 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10259 *cost = COSTS_N_INSNS (nregs);
10260 }
10261 else
10262 /* Cost is just the cost of the RHS of the set. */
10263 *cost += rtx_cost (op1, mode, SET, 1, speed);
10264 return true;
10265
10266 case ZERO_EXTRACT:
10267 case SIGN_EXTRACT:
10268 /* Bit-field insertion. Strip any redundant widening of
10269 the RHS to meet the width of the target. */
10270 if (GET_CODE (op1) == SUBREG)
10271 op1 = SUBREG_REG (op1);
10272 if ((GET_CODE (op1) == ZERO_EXTEND
10273 || GET_CODE (op1) == SIGN_EXTEND)
10274 && CONST_INT_P (XEXP (op0, 1))
10275 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10276 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10277 op1 = XEXP (op1, 0);
10278
10279 if (CONST_INT_P (op1))
10280 {
10281 /* MOV immediate is assumed to always be cheap. */
10282 *cost = COSTS_N_INSNS (1);
10283 }
10284 else
10285 {
10286 /* BFM. */
10287 if (speed)
10288 *cost += extra_cost->alu.bfi;
10289 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10290 }
10291
10292 return true;
10293
10294 default:
10295 /* We can't make sense of this, assume default cost. */
10296 *cost = COSTS_N_INSNS (1);
10297 return false;
10298 }
10299 return false;
10300
10301 case CONST_INT:
10302 /* If an instruction can incorporate a constant within the
10303 instruction, the instruction's expression avoids calling
10304 rtx_cost() on the constant. If rtx_cost() is called on a
10305 constant, then it is usually because the constant must be
10306 moved into a register by one or more instructions.
10307
10308 The exception is constant 0, which can be expressed
10309 as XZR/WZR and is therefore free. The exception to this is
10310 if we have (set (reg) (const0_rtx)) in which case we must cost
10311 the move. However, we can catch that when we cost the SET, so
10312 we don't need to consider that here. */
10313 if (x == const0_rtx)
10314 *cost = 0;
10315 else
10316 {
10317 /* To an approximation, building any other constant is
10318 proportionally expensive to the number of instructions
10319 required to build that constant. This is true whether we
10320 are compiling for SPEED or otherwise. */
10321 if (!is_a <scalar_int_mode> (mode, &int_mode))
10322 int_mode = word_mode;
10323 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10324 (NULL_RTX, x, false, int_mode));
10325 }
10326 return true;
10327
10328 case CONST_DOUBLE:
10329
10330 /* First determine number of instructions to do the move
10331 as an integer constant. */
10332 if (!aarch64_float_const_representable_p (x)
10333 && !aarch64_can_const_movi_rtx_p (x, mode)
10334 && aarch64_float_const_rtx_p (x))
10335 {
10336 unsigned HOST_WIDE_INT ival;
10337 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10338 gcc_assert (succeed);
10339
10340 scalar_int_mode imode = (mode == HFmode
10341 ? SImode
10342 : int_mode_for_mode (mode).require ());
10343 int ncost = aarch64_internal_mov_immediate
10344 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10345 *cost += COSTS_N_INSNS (ncost);
10346 return true;
10347 }
10348
10349 if (speed)
10350 {
10351 /* mov[df,sf]_aarch64. */
10352 if (aarch64_float_const_representable_p (x))
10353 /* FMOV (scalar immediate). */
10354 *cost += extra_cost->fp[mode == DFmode].fpconst;
10355 else if (!aarch64_float_const_zero_rtx_p (x))
10356 {
10357 /* This will be a load from memory. */
10358 if (mode == DFmode)
10359 *cost += extra_cost->ldst.loadd;
10360 else
10361 *cost += extra_cost->ldst.loadf;
10362 }
10363 else
10364 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10365 or MOV v0.s[0], wzr, neither of which is modeled by the
10366 cost tables. Just use the default cost. */
10367 {
10368 }
10369 }
10370
10371 return true;
10372
10373 case MEM:
10374 if (speed)
10375 {
10376 /* For loads we want the base cost of a load, plus an
10377 approximation for the additional cost of the addressing
10378 mode. */
10379 rtx address = XEXP (x, 0);
10380 if (VECTOR_MODE_P (mode))
10381 *cost += extra_cost->ldst.loadv;
10382 else if (GET_MODE_CLASS (mode) == MODE_INT)
10383 *cost += extra_cost->ldst.load;
10384 else if (mode == SFmode)
10385 *cost += extra_cost->ldst.loadf;
10386 else if (mode == DFmode)
10387 *cost += extra_cost->ldst.loadd;
10388
10389 *cost +=
10390 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10391 0, speed));
10392 }
10393
10394 return true;
10395
10396 case NEG:
10397 op0 = XEXP (x, 0);
10398
10399 if (VECTOR_MODE_P (mode))
10400 {
10401 if (speed)
10402 {
10403 /* FNEG. */
10404 *cost += extra_cost->vect.alu;
10405 }
10406 return false;
10407 }
10408
10409 if (GET_MODE_CLASS (mode) == MODE_INT)
10410 {
10411 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10412 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10413 {
10414 /* CSETM. */
10415 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10416 return true;
10417 }
10418
10419 /* Cost this as SUB wzr, X. */
10420 op0 = CONST0_RTX (mode);
10421 op1 = XEXP (x, 0);
10422 goto cost_minus;
10423 }
10424
10425 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10426 {
10427 /* Support (neg(fma...)) as a single instruction only if
10428 sign of zeros is unimportant. This matches the decision
10429 making in aarch64.md. */
10430 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10431 {
10432 /* FNMADD. */
10433 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10434 return true;
10435 }
10436 if (GET_CODE (op0) == MULT)
10437 {
10438 /* FNMUL. */
10439 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10440 return true;
10441 }
10442 if (speed)
10443 /* FNEG. */
10444 *cost += extra_cost->fp[mode == DFmode].neg;
10445 return false;
10446 }
10447
10448 return false;
10449
10450 case CLRSB:
10451 case CLZ:
10452 if (speed)
10453 {
10454 if (VECTOR_MODE_P (mode))
10455 *cost += extra_cost->vect.alu;
10456 else
10457 *cost += extra_cost->alu.clz;
10458 }
10459
10460 return false;
10461
10462 case COMPARE:
10463 op0 = XEXP (x, 0);
10464 op1 = XEXP (x, 1);
10465
10466 if (op1 == const0_rtx
10467 && GET_CODE (op0) == AND)
10468 {
10469 x = op0;
10470 mode = GET_MODE (op0);
10471 goto cost_logic;
10472 }
10473
10474 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10475 {
10476 /* TODO: A write to the CC flags possibly costs extra; this
10477 needs encoding in the cost tables. */
10478
10479 mode = GET_MODE (op0);
10480 /* ANDS. */
10481 if (GET_CODE (op0) == AND)
10482 {
10483 x = op0;
10484 goto cost_logic;
10485 }
10486
10487 if (GET_CODE (op0) == PLUS)
10488 {
10489 /* ADDS (and CMN alias). */
10490 x = op0;
10491 goto cost_plus;
10492 }
10493
10494 if (GET_CODE (op0) == MINUS)
10495 {
10496 /* SUBS. */
10497 x = op0;
10498 goto cost_minus;
10499 }
10500
10501 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10502 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10503 && CONST_INT_P (XEXP (op0, 2)))
10504 {
10505 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10506 Handle it here directly rather than going to cost_logic
10507 since we know the immediate generated for the TST is valid
10508 so we can avoid creating an intermediate rtx for it only
10509 for costing purposes. */
10510 if (speed)
10511 *cost += extra_cost->alu.logical;
10512
10513 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10514 ZERO_EXTRACT, 0, speed);
10515 return true;
10516 }
10517
10518 if (GET_CODE (op1) == NEG)
10519 {
10520 /* CMN. */
10521 if (speed)
10522 *cost += extra_cost->alu.arith;
10523
10524 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10525 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10526 return true;
10527 }
10528
10529 /* CMP.
10530
10531 Compare can freely swap the order of operands, and
10532 canonicalization puts the more complex operation first.
10533 But the integer MINUS logic expects the shift/extend
10534 operation in op1. */
10535 if (! (REG_P (op0)
10536 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10537 {
10538 op0 = XEXP (x, 1);
10539 op1 = XEXP (x, 0);
10540 }
10541 goto cost_minus;
10542 }
10543
10544 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10545 {
10546 /* FCMP. */
10547 if (speed)
10548 *cost += extra_cost->fp[mode == DFmode].compare;
10549
10550 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10551 {
10552 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10553 /* FCMP supports constant 0.0 for no extra cost. */
10554 return true;
10555 }
10556 return false;
10557 }
10558
10559 if (VECTOR_MODE_P (mode))
10560 {
10561 /* Vector compare. */
10562 if (speed)
10563 *cost += extra_cost->vect.alu;
10564
10565 if (aarch64_float_const_zero_rtx_p (op1))
10566 {
10567 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10568 cost. */
10569 return true;
10570 }
10571 return false;
10572 }
10573 return false;
10574
10575 case MINUS:
10576 {
10577 op0 = XEXP (x, 0);
10578 op1 = XEXP (x, 1);
10579
10580 cost_minus:
10581 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10582
10583 /* Detect valid immediates. */
10584 if ((GET_MODE_CLASS (mode) == MODE_INT
10585 || (GET_MODE_CLASS (mode) == MODE_CC
10586 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10587 && CONST_INT_P (op1)
10588 && aarch64_uimm12_shift (INTVAL (op1)))
10589 {
10590 if (speed)
10591 /* SUB(S) (immediate). */
10592 *cost += extra_cost->alu.arith;
10593 return true;
10594 }
10595
10596 /* Look for SUB (extended register). */
10597 if (is_a <scalar_int_mode> (mode, &int_mode)
10598 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10599 {
10600 if (speed)
10601 *cost += extra_cost->alu.extend_arith;
10602
10603 op1 = aarch64_strip_extend (op1, true);
10604 *cost += rtx_cost (op1, VOIDmode,
10605 (enum rtx_code) GET_CODE (op1), 0, speed);
10606 return true;
10607 }
10608
10609 rtx new_op1 = aarch64_strip_extend (op1, false);
10610
10611 /* Cost this as an FMA-alike operation. */
10612 if ((GET_CODE (new_op1) == MULT
10613 || aarch64_shift_p (GET_CODE (new_op1)))
10614 && code != COMPARE)
10615 {
10616 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10617 (enum rtx_code) code,
10618 speed);
10619 return true;
10620 }
10621
10622 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10623
10624 if (speed)
10625 {
10626 if (VECTOR_MODE_P (mode))
10627 {
10628 /* Vector SUB. */
10629 *cost += extra_cost->vect.alu;
10630 }
10631 else if (GET_MODE_CLASS (mode) == MODE_INT)
10632 {
10633 /* SUB(S). */
10634 *cost += extra_cost->alu.arith;
10635 }
10636 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10637 {
10638 /* FSUB. */
10639 *cost += extra_cost->fp[mode == DFmode].addsub;
10640 }
10641 }
10642 return true;
10643 }
10644
10645 case PLUS:
10646 {
10647 rtx new_op0;
10648
10649 op0 = XEXP (x, 0);
10650 op1 = XEXP (x, 1);
10651
10652 cost_plus:
10653 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10654 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10655 {
10656 /* CSINC. */
10657 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10658 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10659 return true;
10660 }
10661
10662 if (GET_MODE_CLASS (mode) == MODE_INT
10663 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10664 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10665 {
10666 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10667
10668 if (speed)
10669 /* ADD (immediate). */
10670 *cost += extra_cost->alu.arith;
10671 return true;
10672 }
10673
10674 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10675
10676 /* Look for ADD (extended register). */
10677 if (is_a <scalar_int_mode> (mode, &int_mode)
10678 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10679 {
10680 if (speed)
10681 *cost += extra_cost->alu.extend_arith;
10682
10683 op0 = aarch64_strip_extend (op0, true);
10684 *cost += rtx_cost (op0, VOIDmode,
10685 (enum rtx_code) GET_CODE (op0), 0, speed);
10686 return true;
10687 }
10688
10689 /* Strip any extend, leave shifts behind as we will
10690 cost them through mult_cost. */
10691 new_op0 = aarch64_strip_extend (op0, false);
10692
10693 if (GET_CODE (new_op0) == MULT
10694 || aarch64_shift_p (GET_CODE (new_op0)))
10695 {
10696 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10697 speed);
10698 return true;
10699 }
10700
10701 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10702
10703 if (speed)
10704 {
10705 if (VECTOR_MODE_P (mode))
10706 {
10707 /* Vector ADD. */
10708 *cost += extra_cost->vect.alu;
10709 }
10710 else if (GET_MODE_CLASS (mode) == MODE_INT)
10711 {
10712 /* ADD. */
10713 *cost += extra_cost->alu.arith;
10714 }
10715 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10716 {
10717 /* FADD. */
10718 *cost += extra_cost->fp[mode == DFmode].addsub;
10719 }
10720 }
10721 return true;
10722 }
10723
10724 case BSWAP:
10725 *cost = COSTS_N_INSNS (1);
10726
10727 if (speed)
10728 {
10729 if (VECTOR_MODE_P (mode))
10730 *cost += extra_cost->vect.alu;
10731 else
10732 *cost += extra_cost->alu.rev;
10733 }
10734 return false;
10735
10736 case IOR:
10737 if (aarch_rev16_p (x))
10738 {
10739 *cost = COSTS_N_INSNS (1);
10740
10741 if (speed)
10742 {
10743 if (VECTOR_MODE_P (mode))
10744 *cost += extra_cost->vect.alu;
10745 else
10746 *cost += extra_cost->alu.rev;
10747 }
10748 return true;
10749 }
10750
10751 if (aarch64_extr_rtx_p (x, &op0, &op1))
10752 {
10753 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10754 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10755 if (speed)
10756 *cost += extra_cost->alu.shift;
10757
10758 return true;
10759 }
10760 /* Fall through. */
10761 case XOR:
10762 case AND:
10763 cost_logic:
10764 op0 = XEXP (x, 0);
10765 op1 = XEXP (x, 1);
10766
10767 if (VECTOR_MODE_P (mode))
10768 {
10769 if (speed)
10770 *cost += extra_cost->vect.alu;
10771 return true;
10772 }
10773
10774 if (code == AND
10775 && GET_CODE (op0) == MULT
10776 && CONST_INT_P (XEXP (op0, 1))
10777 && CONST_INT_P (op1)
10778 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10779 INTVAL (op1)) != 0)
10780 {
10781 /* This is a UBFM/SBFM. */
10782 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10783 if (speed)
10784 *cost += extra_cost->alu.bfx;
10785 return true;
10786 }
10787
10788 if (is_int_mode (mode, &int_mode))
10789 {
10790 if (CONST_INT_P (op1))
10791 {
10792 /* We have a mask + shift version of a UBFIZ
10793 i.e. the *andim_ashift<mode>_bfiz pattern. */
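/* As a purely illustrative example, an SImode (and (ashift x 3) 0x3f8)
   is the kind of RTL that matches here; it can be emitted as a single
   UBFIZ such as "ubfiz w0, w1, 3, 7", so only the inner operand of the
   shift plus one bfx-class ALU cost need to be accounted for.  */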
10794 if (GET_CODE (op0) == ASHIFT
10795 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10796 XEXP (op0, 1)))
10797 {
10798 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10799 (enum rtx_code) code, 0, speed);
10800 if (speed)
10801 *cost += extra_cost->alu.bfx;
10802
10803 return true;
10804 }
10805 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10806 {
10807 /* We possibly get the immediate for free; this is not
10808 modelled. */
10809 *cost += rtx_cost (op0, int_mode,
10810 (enum rtx_code) code, 0, speed);
10811 if (speed)
10812 *cost += extra_cost->alu.logical;
10813
10814 return true;
10815 }
10816 }
10817 else
10818 {
10819 rtx new_op0 = op0;
10820
10821 /* Handle ORN, EON, or BIC. */
10822 if (GET_CODE (op0) == NOT)
10823 op0 = XEXP (op0, 0);
10824
10825 new_op0 = aarch64_strip_shift (op0);
10826
10827 /* If we had a shift on op0 then this is a logical-shift-
10828 by-register/immediate operation. Otherwise, this is just
10829 a logical operation. */
10830 if (speed)
10831 {
10832 if (new_op0 != op0)
10833 {
10834 /* Shift by immediate. */
10835 if (CONST_INT_P (XEXP (op0, 1)))
10836 *cost += extra_cost->alu.log_shift;
10837 else
10838 *cost += extra_cost->alu.log_shift_reg;
10839 }
10840 else
10841 *cost += extra_cost->alu.logical;
10842 }
10843
10844 /* In both cases we want to cost both operands. */
10845 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10846 0, speed);
10847 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10848 1, speed);
10849
10850 return true;
10851 }
10852 }
10853 return false;
10854
10855 case NOT:
10856 x = XEXP (x, 0);
10857 op0 = aarch64_strip_shift (x);
10858
10859 if (VECTOR_MODE_P (mode))
10860 {
10861 /* Vector NOT. */
10862 *cost += extra_cost->vect.alu;
10863 return false;
10864 }
10865
10866 /* MVN-shifted-reg. */
10867 if (op0 != x)
10868 {
10869 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10870
10871 if (speed)
10872 *cost += extra_cost->alu.log_shift;
10873
10874 return true;
10875 }
10876 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10877 Handle the second form here taking care that 'a' in the above can
10878 be a shift. */
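/* For reference (this only restates the target semantics): EON Xd, Xn, Xm
   computes Xn ^ ~Xm, which is why both (xor (not a) b) and
   (not (xor a b)) can be costed as a single logical instruction,
   optionally with a shifted-register operand.  */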
10879 else if (GET_CODE (op0) == XOR)
10880 {
10881 rtx newop0 = XEXP (op0, 0);
10882 rtx newop1 = XEXP (op0, 1);
10883 rtx op0_stripped = aarch64_strip_shift (newop0);
10884
10885 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10886 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10887
10888 if (speed)
10889 {
10890 if (op0_stripped != newop0)
10891 *cost += extra_cost->alu.log_shift;
10892 else
10893 *cost += extra_cost->alu.logical;
10894 }
10895
10896 return true;
10897 }
10898 /* MVN. */
10899 if (speed)
10900 *cost += extra_cost->alu.logical;
10901
10902 return false;
10903
10904 case ZERO_EXTEND:
10905
10906 op0 = XEXP (x, 0);
10907 /* If a value is written in SI mode, then zero extended to DI
10908 mode, the operation will in general be free as a write to
10909 a 'w' register implicitly zeroes the upper bits of an 'x'
10910 register. However, if this is
10911
10912 (set (reg) (zero_extend (reg)))
10913
10914 we must cost the explicit register move. */
10915 if (mode == DImode
10916 && GET_MODE (op0) == SImode
10917 && outer == SET)
10918 {
10919 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10920
10921 /* If OP_COST is non-zero, then the cost of the zero extend
10922 is effectively the cost of the inner operation. Otherwise
10923 we have a MOV instruction and we take the cost from the MOV
10924 itself. This is true independently of whether we are
10925 optimizing for space or time. */
10926 if (op_cost)
10927 *cost = op_cost;
10928
10929 return true;
10930 }
10931 else if (MEM_P (op0))
10932 {
10933 /* All loads can zero extend to any size for free. */
10934 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10935 return true;
10936 }
10937
10938 op0 = aarch64_extend_bitfield_pattern_p (x);
10939 if (op0)
10940 {
10941 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10942 if (speed)
10943 *cost += extra_cost->alu.bfx;
10944 return true;
10945 }
10946
10947 if (speed)
10948 {
10949 if (VECTOR_MODE_P (mode))
10950 {
10951 /* UMOV. */
10952 *cost += extra_cost->vect.alu;
10953 }
10954 else
10955 {
10956 /* We generate an AND instead of UXTB/UXTH. */
10957 *cost += extra_cost->alu.logical;
10958 }
10959 }
10960 return false;
10961
10962 case SIGN_EXTEND:
10963 if (MEM_P (XEXP (x, 0)))
10964 {
10965 /* LDRSH. */
10966 if (speed)
10967 {
10968 rtx address = XEXP (XEXP (x, 0), 0);
10969 *cost += extra_cost->ldst.load_sign_extend;
10970
10971 *cost +=
10972 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10973 0, speed));
10974 }
10975 return true;
10976 }
10977
10978 op0 = aarch64_extend_bitfield_pattern_p (x);
10979 if (op0)
10980 {
10981 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10982 if (speed)
10983 *cost += extra_cost->alu.bfx;
10984 return true;
10985 }
10986
10987 if (speed)
10988 {
10989 if (VECTOR_MODE_P (mode))
10990 *cost += extra_cost->vect.alu;
10991 else
10992 *cost += extra_cost->alu.extend;
10993 }
10994 return false;
10995
10996 case ASHIFT:
10997 op0 = XEXP (x, 0);
10998 op1 = XEXP (x, 1);
10999
11000 if (CONST_INT_P (op1))
11001 {
11002 if (speed)
11003 {
11004 if (VECTOR_MODE_P (mode))
11005 {
11006 /* Vector shift (immediate). */
11007 *cost += extra_cost->vect.alu;
11008 }
11009 else
11010 {
11011 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11012 aliases. */
11013 *cost += extra_cost->alu.shift;
11014 }
11015 }
11016
11017 /* We can incorporate zero/sign extend for free. */
11018 if (GET_CODE (op0) == ZERO_EXTEND
11019 || GET_CODE (op0) == SIGN_EXTEND)
11020 op0 = XEXP (op0, 0);
11021
11022 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11023 return true;
11024 }
11025 else
11026 {
11027 if (VECTOR_MODE_P (mode))
11028 {
11029 if (speed)
11030 /* Vector shift (register). */
11031 *cost += extra_cost->vect.alu;
11032 }
11033 else
11034 {
11035 if (speed)
11036 /* LSLV. */
11037 *cost += extra_cost->alu.shift_reg;
11038
11039 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11040 && CONST_INT_P (XEXP (op1, 1))
11041 && known_eq (INTVAL (XEXP (op1, 1)),
11042 GET_MODE_BITSIZE (mode) - 1))
11043 {
11044 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11045 /* We already demanded XEXP (op1, 0) to be REG_P, so
11046 don't recurse into it. */
11047 return true;
11048 }
11049 }
11050 return false; /* All arguments need to be in registers. */
11051 }
11052
11053 case ROTATE:
11054 case ROTATERT:
11055 case LSHIFTRT:
11056 case ASHIFTRT:
11057 op0 = XEXP (x, 0);
11058 op1 = XEXP (x, 1);
11059
11060 if (CONST_INT_P (op1))
11061 {
11062 /* ASR (immediate) and friends. */
11063 if (speed)
11064 {
11065 if (VECTOR_MODE_P (mode))
11066 *cost += extra_cost->vect.alu;
11067 else
11068 *cost += extra_cost->alu.shift;
11069 }
11070
11071 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11072 return true;
11073 }
11074 else
11075 {
11076 if (VECTOR_MODE_P (mode))
11077 {
11078 if (speed)
11079 /* Vector shift (register). */
11080 *cost += extra_cost->vect.alu;
11081 }
11082 else
11083 {
11084 if (speed)
11085 /* ASR (register) and friends. */
11086 *cost += extra_cost->alu.shift_reg;
11087
11088 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11089 && CONST_INT_P (XEXP (op1, 1))
11090 && known_eq (INTVAL (XEXP (op1, 1)),
11091 GET_MODE_BITSIZE (mode) - 1))
11092 {
11093 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11094 /* We already demanded XEXP (op1, 0) to be REG_P, so
11095 don't recurse into it. */
11096 return true;
11097 }
11098 }
11099 return false; /* All arguments need to be in registers. */
11100 }
11101
11102 case SYMBOL_REF:
11103
11104 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11105 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11106 {
11107 /* LDR. */
11108 if (speed)
11109 *cost += extra_cost->ldst.load;
11110 }
11111 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11112 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11113 {
11114 /* ADRP, followed by ADD. */
11115 *cost += COSTS_N_INSNS (1);
11116 if (speed)
11117 *cost += 2 * extra_cost->alu.arith;
11118 }
11119 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11120 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11121 {
11122 /* ADR. */
11123 if (speed)
11124 *cost += extra_cost->alu.arith;
11125 }
11126
11127 if (flag_pic)
11128 {
11129 /* One extra load instruction, after accessing the GOT. */
11130 *cost += COSTS_N_INSNS (1);
11131 if (speed)
11132 *cost += extra_cost->ldst.load;
11133 }
11134 return true;
11135
11136 case HIGH:
11137 case LO_SUM:
11138 /* ADRP/ADD (immediate). */
11139 if (speed)
11140 *cost += extra_cost->alu.arith;
11141 return true;
11142
11143 case ZERO_EXTRACT:
11144 case SIGN_EXTRACT:
11145 /* UBFX/SBFX. */
11146 if (speed)
11147 {
11148 if (VECTOR_MODE_P (mode))
11149 *cost += extra_cost->vect.alu;
11150 else
11151 *cost += extra_cost->alu.bfx;
11152 }
11153
11154 /* We can trust that the immediates used will be correct (there
11155 are no by-register forms), so we need only cost op0. */
11156 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11157 return true;
11158
11159 case MULT:
11160 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11161 /* aarch64_rtx_mult_cost always handles recursion to its
11162 operands. */
11163 return true;
11164
11165 case MOD:
11166 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11167 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
11168 an unconditional negate. This case should only ever be reached through
11169 the set_smod_pow2_cheap check in expmed.c. */
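/* As an illustrative sketch (register numbers chosen arbitrarily), an
   SImode "x % 4" is expected to expand along the lines of:

     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi

   which matches the four-instruction baseline set below.  */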
11170 if (CONST_INT_P (XEXP (x, 1))
11171 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11172 && (mode == SImode || mode == DImode))
11173 {
11174 /* We expand to 4 instructions. Reset the baseline. */
11175 *cost = COSTS_N_INSNS (4);
11176
11177 if (speed)
11178 *cost += 2 * extra_cost->alu.logical
11179 + 2 * extra_cost->alu.arith;
11180
11181 return true;
11182 }
11183
11184 /* Fall-through. */
11185 case UMOD:
11186 if (speed)
11187 {
11188 /* Slightly prefer UMOD over SMOD. */
11189 if (VECTOR_MODE_P (mode))
11190 *cost += extra_cost->vect.alu;
11191 else if (GET_MODE_CLASS (mode) == MODE_INT)
11192 *cost += (extra_cost->mult[mode == DImode].add
11193 + extra_cost->mult[mode == DImode].idiv
11194 + (code == MOD ? 1 : 0));
11195 }
11196 return false; /* All arguments need to be in registers. */
11197
11198 case DIV:
11199 case UDIV:
11200 case SQRT:
11201 if (speed)
11202 {
11203 if (VECTOR_MODE_P (mode))
11204 *cost += extra_cost->vect.alu;
11205 else if (GET_MODE_CLASS (mode) == MODE_INT)
11206 /* There is no integer SQRT, so only DIV and UDIV can get
11207 here. */
11208 *cost += (extra_cost->mult[mode == DImode].idiv
11209 /* Slightly prefer UDIV over SDIV. */
11210 + (code == DIV ? 1 : 0));
11211 else
11212 *cost += extra_cost->fp[mode == DFmode].div;
11213 }
11214 return false; /* All arguments need to be in registers. */
11215
11216 case IF_THEN_ELSE:
11217 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11218 XEXP (x, 2), cost, speed);
11219
11220 case EQ:
11221 case NE:
11222 case GT:
11223 case GTU:
11224 case LT:
11225 case LTU:
11226 case GE:
11227 case GEU:
11228 case LE:
11229 case LEU:
11230
11231 return false; /* All arguments must be in registers. */
11232
11233 case FMA:
11234 op0 = XEXP (x, 0);
11235 op1 = XEXP (x, 1);
11236 op2 = XEXP (x, 2);
11237
11238 if (speed)
11239 {
11240 if (VECTOR_MODE_P (mode))
11241 *cost += extra_cost->vect.alu;
11242 else
11243 *cost += extra_cost->fp[mode == DFmode].fma;
11244 }
11245
11246 /* FMSUB, FNMADD, and FNMSUB are free. */
11247 if (GET_CODE (op0) == NEG)
11248 op0 = XEXP (op0, 0);
11249
11250 if (GET_CODE (op2) == NEG)
11251 op2 = XEXP (op2, 0);
11252
11253 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11254 and the by-element operand as operand 0. */
11255 if (GET_CODE (op1) == NEG)
11256 op1 = XEXP (op1, 0);
11257
11258 /* Catch vector-by-element operations. The by-element operand can
11259 either be (vec_duplicate (vec_select (x))) or just
11260 (vec_select (x)), depending on whether we are multiplying by
11261 a vector or a scalar.
11262
11263 Canonicalization is not very good in these cases: FMA4 will put the
11264 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11265 if (GET_CODE (op0) == VEC_DUPLICATE)
11266 op0 = XEXP (op0, 0);
11267 else if (GET_CODE (op1) == VEC_DUPLICATE)
11268 op1 = XEXP (op1, 0);
11269
11270 if (GET_CODE (op0) == VEC_SELECT)
11271 op0 = XEXP (op0, 0);
11272 else if (GET_CODE (op1) == VEC_SELECT)
11273 op1 = XEXP (op1, 0);
11274
11275 /* If the remaining parameters are not registers,
11276 get the cost to put them into registers. */
11277 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11278 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11279 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11280 return true;
11281
11282 case FLOAT:
11283 case UNSIGNED_FLOAT:
11284 if (speed)
11285 *cost += extra_cost->fp[mode == DFmode].fromint;
11286 return false;
11287
11288 case FLOAT_EXTEND:
11289 if (speed)
11290 {
11291 if (VECTOR_MODE_P (mode))
11292 {
11293 /* Vector widening conversion. */
11294 *cost += extra_cost->vect.alu;
11295 }
11296 else
11297 *cost += extra_cost->fp[mode == DFmode].widen;
11298 }
11299 return false;
11300
11301 case FLOAT_TRUNCATE:
11302 if (speed)
11303 {
11304 if (VECTOR_MODE_P (mode))
11305 {
11306 /* Vector conversion. */
11307 *cost += extra_cost->vect.alu;
11308 }
11309 else
11310 *cost += extra_cost->fp[mode == DFmode].narrow;
11311 }
11312 return false;
11313
11314 case FIX:
11315 case UNSIGNED_FIX:
11316 x = XEXP (x, 0);
11317 /* Strip the rounding part. They will all be implemented
11318 by the fcvt* family of instructions anyway. */
11319 if (GET_CODE (x) == UNSPEC)
11320 {
11321 unsigned int uns_code = XINT (x, 1);
11322
11323 if (uns_code == UNSPEC_FRINTA
11324 || uns_code == UNSPEC_FRINTM
11325 || uns_code == UNSPEC_FRINTN
11326 || uns_code == UNSPEC_FRINTP
11327 || uns_code == UNSPEC_FRINTZ)
11328 x = XVECEXP (x, 0, 0);
11329 }
11330
11331 if (speed)
11332 {
11333 if (VECTOR_MODE_P (mode))
11334 *cost += extra_cost->vect.alu;
11335 else
11336 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11337 }
11338
11339 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11340 fixed-point fcvt. */
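/* For example (an illustrative case rather than an exhaustive one),
   (fix:SI (mult:SF x 4.0)) can be implemented as a single
   "fcvtzs w0, s0, #2", i.e. a fixed-point convert with two fractional
   bits, so only the first operand of the multiplication is costed.  */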
11341 if (GET_CODE (x) == MULT
11342 && ((VECTOR_MODE_P (mode)
11343 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11344 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11345 {
11346 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11347 0, speed);
11348 return true;
11349 }
11350
11351 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11352 return true;
11353
11354 case ABS:
11355 if (VECTOR_MODE_P (mode))
11356 {
11357 /* ABS (vector). */
11358 if (speed)
11359 *cost += extra_cost->vect.alu;
11360 }
11361 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11362 {
11363 op0 = XEXP (x, 0);
11364
11365 /* FABD, which is analogous to FADD. */
11366 if (GET_CODE (op0) == MINUS)
11367 {
11368 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11369 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11370 if (speed)
11371 *cost += extra_cost->fp[mode == DFmode].addsub;
11372
11373 return true;
11374 }
11375 /* Simple FABS is analogous to FNEG. */
11376 if (speed)
11377 *cost += extra_cost->fp[mode == DFmode].neg;
11378 }
11379 else
11380 {
11381 /* Integer ABS will either be split to
11382 two arithmetic instructions, or will be an ABS
11383 (scalar), which we don't model. */
11384 *cost = COSTS_N_INSNS (2);
11385 if (speed)
11386 *cost += 2 * extra_cost->alu.arith;
11387 }
11388 return false;
11389
11390 case SMAX:
11391 case SMIN:
11392 if (speed)
11393 {
11394 if (VECTOR_MODE_P (mode))
11395 *cost += extra_cost->vect.alu;
11396 else
11397 {
11398 /* FMAXNM/FMINNM/FMAX/FMIN.
11399 TODO: This may not be accurate for all implementations, but
11400 we do not model this in the cost tables. */
11401 *cost += extra_cost->fp[mode == DFmode].addsub;
11402 }
11403 }
11404 return false;
11405
11406 case UNSPEC:
11407 /* The floating point round to integer frint* instructions. */
11408 if (aarch64_frint_unspec_p (XINT (x, 1)))
11409 {
11410 if (speed)
11411 *cost += extra_cost->fp[mode == DFmode].roundint;
11412
11413 return false;
11414 }
11415
11416 if (XINT (x, 1) == UNSPEC_RBIT)
11417 {
11418 if (speed)
11419 *cost += extra_cost->alu.rev;
11420
11421 return false;
11422 }
11423 break;
11424
11425 case TRUNCATE:
11426
11427 /* Decompose <su>muldi3_highpart. */
11428 if (/* (truncate:DI */
11429 mode == DImode
11430 /* (lshiftrt:TI */
11431 && GET_MODE (XEXP (x, 0)) == TImode
11432 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11433 /* (mult:TI */
11434 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11435 /* (ANY_EXTEND:TI (reg:DI))
11436 (ANY_EXTEND:TI (reg:DI))) */
11437 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11438 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11439 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11440 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11441 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11442 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11443 /* (const_int 64) */
11444 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11445 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11446 {
11447 /* UMULH/SMULH. */
11448 if (speed)
11449 *cost += extra_cost->mult[mode == DImode].extend;
11450 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11451 mode, MULT, 0, speed);
11452 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11453 mode, MULT, 1, speed);
11454 return true;
11455 }
11456
11457 /* Fall through. */
11458 default:
11459 break;
11460 }
11461
11462 if (dump_file
11463 && flag_aarch64_verbose_cost)
11464 fprintf (dump_file,
11465 "\nFailed to cost RTX. Assuming default cost.\n");
11466
11467 return true;
11468 }
11469
11470 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11471 calculated for X. This cost is stored in *COST. Returns true
11472 if the total cost of X was calculated. */
11473 static bool
11474 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11475 int param, int *cost, bool speed)
11476 {
11477 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11478
11479 if (dump_file
11480 && flag_aarch64_verbose_cost)
11481 {
11482 print_rtl_single (dump_file, x);
11483 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11484 speed ? "Hot" : "Cold",
11485 *cost, result ? "final" : "partial");
11486 }
11487
11488 return result;
11489 }
11490
11491 static int
11492 aarch64_register_move_cost (machine_mode mode,
11493 reg_class_t from_i, reg_class_t to_i)
11494 {
11495 enum reg_class from = (enum reg_class) from_i;
11496 enum reg_class to = (enum reg_class) to_i;
11497 const struct cpu_regmove_cost *regmove_cost
11498 = aarch64_tune_params.regmove_cost;
11499
11500 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11501 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11502 to = GENERAL_REGS;
11503
11504 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11505 from = GENERAL_REGS;
11506
11507 /* Moving between GPR and stack cost is the same as GP2GP. */
11508 if ((from == GENERAL_REGS && to == STACK_REG)
11509 || (to == GENERAL_REGS && from == STACK_REG))
11510 return regmove_cost->GP2GP;
11511
11512 /* To/From the stack register, we move via the gprs. */
11513 if (to == STACK_REG || from == STACK_REG)
11514 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11515 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11516
11517 if (known_eq (GET_MODE_SIZE (mode), 16))
11518 {
11519 /* 128-bit operations on general registers require 2 instructions. */
11520 if (from == GENERAL_REGS && to == GENERAL_REGS)
11521 return regmove_cost->GP2GP * 2;
11522 else if (from == GENERAL_REGS)
11523 return regmove_cost->GP2FP * 2;
11524 else if (to == GENERAL_REGS)
11525 return regmove_cost->FP2GP * 2;
11526
11527 /* When AdvSIMD instructions are disabled it is not possible to move
11528 a 128-bit value directly between Q registers. This is handled in
11529 secondary reload. A general register is used as a scratch to move
11530 the upper DI value, and the lower DI value is moved directly,
11531 hence the cost is the sum of three moves. */
11532 if (! TARGET_SIMD)
11533 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11534
11535 return regmove_cost->FP2FP;
11536 }
11537
11538 if (from == GENERAL_REGS && to == GENERAL_REGS)
11539 return regmove_cost->GP2GP;
11540 else if (from == GENERAL_REGS)
11541 return regmove_cost->GP2FP;
11542 else if (to == GENERAL_REGS)
11543 return regmove_cost->FP2GP;
11544
11545 return regmove_cost->FP2FP;
11546 }
11547
11548 static int
11549 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11550 reg_class_t rclass ATTRIBUTE_UNUSED,
11551 bool in ATTRIBUTE_UNUSED)
11552 {
11553 return aarch64_tune_params.memmov_cost;
11554 }
11555
11556 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11557 to optimize 1.0/sqrt. */
11558
11559 static bool
11560 use_rsqrt_p (machine_mode mode)
11561 {
11562 return (!flag_trapping_math
11563 && flag_unsafe_math_optimizations
11564 && ((aarch64_tune_params.approx_modes->recip_sqrt
11565 & AARCH64_APPROX_MODE (mode))
11566 || flag_mrecip_low_precision_sqrt));
11567 }
11568
11569 /* Function to decide when to use the approximate reciprocal square root
11570 builtin. */
11571
11572 static tree
11573 aarch64_builtin_reciprocal (tree fndecl)
11574 {
11575 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11576
11577 if (!use_rsqrt_p (mode))
11578 return NULL_TREE;
11579 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11580 }
11581
11582 /* Emit instruction sequence to compute either the approximate square root
11583 or its approximate reciprocal, depending on the flag RECP, and return
11584 whether the sequence was emitted or not. */
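/* A descriptive summary of the sequence below: FRSQRTE provides an
   initial estimate e of 1/sqrt(SRC), and each FRSQRTS step computes
   (3 - SRC * e * e) / 2, so the estimate is refined as
   e' = e * frsqrts (SRC, e * e); the final accuracy therefore depends
   on the iteration count chosen below.  */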
11585
11586 bool
11587 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11588 {
11589 machine_mode mode = GET_MODE (dst);
11590
11591 if (GET_MODE_INNER (mode) == HFmode)
11592 {
11593 gcc_assert (!recp);
11594 return false;
11595 }
11596
11597 if (!recp)
11598 {
11599 if (!(flag_mlow_precision_sqrt
11600 || (aarch64_tune_params.approx_modes->sqrt
11601 & AARCH64_APPROX_MODE (mode))))
11602 return false;
11603
11604 if (flag_finite_math_only
11605 || flag_trapping_math
11606 || !flag_unsafe_math_optimizations
11607 || optimize_function_for_size_p (cfun))
11608 return false;
11609 }
11610 else
11611 /* Caller assumes we cannot fail. */
11612 gcc_assert (use_rsqrt_p (mode));
11613
11614 machine_mode mmsk = mode_for_int_vector (mode).require ();
11615 rtx xmsk = gen_reg_rtx (mmsk);
11616 if (!recp)
11617 /* When calculating the approximate square root, compare the
11618 argument with 0.0 and create a mask. */
11619 emit_insn (gen_rtx_SET (xmsk,
11620 gen_rtx_NEG (mmsk,
11621 gen_rtx_EQ (mmsk, src,
11622 CONST0_RTX (mode)))));
11623
11624 /* Estimate the approximate reciprocal square root. */
11625 rtx xdst = gen_reg_rtx (mode);
11626 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11627
11628 /* Iterate over the series twice for SF and thrice for DF. */
11629 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11630
11631 /* Optionally iterate over the series once less for faster performance
11632 while sacrificing the accuracy. */
11633 if ((recp && flag_mrecip_low_precision_sqrt)
11634 || (!recp && flag_mlow_precision_sqrt))
11635 iterations--;
11636
11637 /* Iterate over the series to calculate the approximate reciprocal square
11638 root. */
11639 rtx x1 = gen_reg_rtx (mode);
11640 while (iterations--)
11641 {
11642 rtx x2 = gen_reg_rtx (mode);
11643 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11644
11645 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11646
11647 if (iterations > 0)
11648 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11649 }
11650
11651 if (!recp)
11652 {
11653 /* Qualify the approximate reciprocal square root when the argument is
11654 0.0 by squashing the intermediary result to 0.0. */
11655 rtx xtmp = gen_reg_rtx (mmsk);
11656 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11657 gen_rtx_SUBREG (mmsk, xdst, 0)));
11658 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11659
11660 /* Calculate the approximate square root. */
11661 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11662 }
11663
11664 /* Finalize the approximation. */
11665 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11666
11667 return true;
11668 }
11669
11670 /* Emit the instruction sequence to compute the approximation for the division
11671 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
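/* A descriptive summary of the sequence below: FRECPE provides an
   initial estimate e of 1/DEN, each FRECPS step computes 2 - DEN * e,
   and the estimate is refined as e' = e * frecps (DEN, e) before the
   final multiplication by NUM.  */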
11672
11673 bool
11674 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11675 {
11676 machine_mode mode = GET_MODE (quo);
11677
11678 if (GET_MODE_INNER (mode) == HFmode)
11679 return false;
11680
11681 bool use_approx_division_p = (flag_mlow_precision_div
11682 || (aarch64_tune_params.approx_modes->division
11683 & AARCH64_APPROX_MODE (mode)));
11684
11685 if (!flag_finite_math_only
11686 || flag_trapping_math
11687 || !flag_unsafe_math_optimizations
11688 || optimize_function_for_size_p (cfun)
11689 || !use_approx_division_p)
11690 return false;
11691
11692 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11693 return false;
11694
11695 /* Estimate the approximate reciprocal. */
11696 rtx xrcp = gen_reg_rtx (mode);
11697 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11698
11699 /* Iterate over the series twice for SF and thrice for DF. */
11700 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11701
11702 /* Optionally iterate over the series once less for faster performance,
11703 while sacrificing the accuracy. */
11704 if (flag_mlow_precision_div)
11705 iterations--;
11706
11707 /* Iterate over the series to calculate the approximate reciprocal. */
11708 rtx xtmp = gen_reg_rtx (mode);
11709 while (iterations--)
11710 {
11711 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11712
11713 if (iterations > 0)
11714 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11715 }
11716
11717 if (num != CONST1_RTX (mode))
11718 {
11719 /* As the approximate reciprocal of DEN is already calculated, only
11720 calculate the approximate division when NUM is not 1.0. */
11721 rtx xnum = force_reg (mode, num);
11722 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11723 }
11724
11725 /* Finalize the approximation. */
11726 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11727 return true;
11728 }
11729
11730 /* Return the number of instructions that can be issued per cycle. */
11731 static int
11732 aarch64_sched_issue_rate (void)
11733 {
11734 return aarch64_tune_params.issue_rate;
11735 }
11736
11737 static int
11738 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11739 {
11740 int issue_rate = aarch64_sched_issue_rate ();
11741
11742 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11743 }
11744
11745
11746 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11747 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11748 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11749
11750 static int
11751 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11752 int ready_index)
11753 {
11754 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11755 }
11756
11757
11758 /* Vectorizer cost model target hooks. */
11759
11760 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11761 static int
11762 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11763 tree vectype,
11764 int misalign ATTRIBUTE_UNUSED)
11765 {
11766 unsigned elements;
11767 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11768 bool fp = false;
11769
11770 if (vectype != NULL)
11771 fp = FLOAT_TYPE_P (vectype);
11772
11773 switch (type_of_cost)
11774 {
11775 case scalar_stmt:
11776 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11777
11778 case scalar_load:
11779 return costs->scalar_load_cost;
11780
11781 case scalar_store:
11782 return costs->scalar_store_cost;
11783
11784 case vector_stmt:
11785 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11786
11787 case vector_load:
11788 return costs->vec_align_load_cost;
11789
11790 case vector_store:
11791 return costs->vec_store_cost;
11792
11793 case vec_to_scalar:
11794 return costs->vec_to_scalar_cost;
11795
11796 case scalar_to_vec:
11797 return costs->scalar_to_vec_cost;
11798
11799 case unaligned_load:
11800 case vector_gather_load:
11801 return costs->vec_unalign_load_cost;
11802
11803 case unaligned_store:
11804 case vector_scatter_store:
11805 return costs->vec_unalign_store_cost;
11806
11807 case cond_branch_taken:
11808 return costs->cond_taken_branch_cost;
11809
11810 case cond_branch_not_taken:
11811 return costs->cond_not_taken_branch_cost;
11812
11813 case vec_perm:
11814 return costs->vec_permute_cost;
11815
11816 case vec_promote_demote:
11817 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11818
11819 case vec_construct:
11820 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11821 return elements / 2 + 1;
11822
11823 default:
11824 gcc_unreachable ();
11825 }
11826 }
11827
11828 /* Implement targetm.vectorize.add_stmt_cost. */
11829 static unsigned
11830 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11831 struct _stmt_vec_info *stmt_info, int misalign,
11832 enum vect_cost_model_location where)
11833 {
11834 unsigned *cost = (unsigned *) data;
11835 unsigned retval = 0;
11836
11837 if (flag_vect_cost_model)
11838 {
11839 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11840 int stmt_cost =
11841 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11842
11843 /* Statements in an inner loop relative to the loop being
11844 vectorized are weighted more heavily. The value here is
11845 arbitrary and could potentially be improved with analysis. */
11846 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11847 count *= 50; /* FIXME */
11848
11849 retval = (unsigned) (count * stmt_cost);
11850 cost[where] += retval;
11851 }
11852
11853 return retval;
11854 }
11855
11856 static void initialize_aarch64_code_model (struct gcc_options *);
11857
11858 /* Parse the TO_PARSE string and put the architecture struct that it
11859 selects into RES and the architectural features into ISA_FLAGS.
11860 Return an aarch64_parse_opt_result describing the parse result.
11861 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11862 When the TO_PARSE string contains an invalid extension,
11863 a copy of the string is created and stored to INVALID_EXTENSION. */
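/* For example (values given purely as an illustration), TO_PARSE might
   be "armv8.2-a" or "armv8.2-a+crypto+nofp"; the authoritative names
   come from the all_architectures table and the extension list.  */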
11864
11865 static enum aarch64_parse_opt_result
11866 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11867 uint64_t *isa_flags, std::string *invalid_extension)
11868 {
11869 const char *ext;
11870 const struct processor *arch;
11871 size_t len;
11872
11873 ext = strchr (to_parse, '+');
11874
11875 if (ext != NULL)
11876 len = ext - to_parse;
11877 else
11878 len = strlen (to_parse);
11879
11880 if (len == 0)
11881 return AARCH64_PARSE_MISSING_ARG;
11882
11883
11884 /* Loop through the list of supported ARCHes to find a match. */
11885 for (arch = all_architectures; arch->name != NULL; arch++)
11886 {
11887 if (strlen (arch->name) == len
11888 && strncmp (arch->name, to_parse, len) == 0)
11889 {
11890 uint64_t isa_temp = arch->flags;
11891
11892 if (ext != NULL)
11893 {
11894 /* TO_PARSE string contains at least one extension. */
11895 enum aarch64_parse_opt_result ext_res
11896 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11897
11898 if (ext_res != AARCH64_PARSE_OK)
11899 return ext_res;
11900 }
11901 /* Extension parsing was successful. Confirm the result
11902 arch and ISA flags. */
11903 *res = arch;
11904 *isa_flags = isa_temp;
11905 return AARCH64_PARSE_OK;
11906 }
11907 }
11908
11909 /* ARCH name not found in list. */
11910 return AARCH64_PARSE_INVALID_ARG;
11911 }
11912
11913 /* Parse the TO_PARSE string and put the result tuning in RES and the
11914 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11915 describing the parse result. If there is an error parsing, RES and
11916 ISA_FLAGS are left unchanged.
11917 When the TO_PARSE string contains an invalid extension,
11918 a copy of the string is created and stored to INVALID_EXTENSION. */
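/* For example (values given purely as an illustration), TO_PARSE might
   be "cortex-a57" or "cortex-a57+nocrypto"; the authoritative names
   come from the all_cores table and the extension list.  */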
11919
11920 static enum aarch64_parse_opt_result
11921 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11922 uint64_t *isa_flags, std::string *invalid_extension)
11923 {
11924 const char *ext;
11925 const struct processor *cpu;
11926 size_t len;
11927
11928 ext = strchr (to_parse, '+');
11929
11930 if (ext != NULL)
11931 len = ext - to_parse;
11932 else
11933 len = strlen (to_parse);
11934
11935 if (len == 0)
11936 return AARCH64_PARSE_MISSING_ARG;
11937
11938
11939 /* Loop through the list of supported CPUs to find a match. */
11940 for (cpu = all_cores; cpu->name != NULL; cpu++)
11941 {
11942 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11943 {
11944 uint64_t isa_temp = cpu->flags;
11945
11946
11947 if (ext != NULL)
11948 {
11949 /* TO_PARSE string contains at least one extension. */
11950 enum aarch64_parse_opt_result ext_res
11951 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11952
11953 if (ext_res != AARCH64_PARSE_OK)
11954 return ext_res;
11955 }
11956 /* Extension parsing was successful. Confirm the result
11957 cpu and ISA flags. */
11958 *res = cpu;
11959 *isa_flags = isa_temp;
11960 return AARCH64_PARSE_OK;
11961 }
11962 }
11963
11964 /* CPU name not found in list. */
11965 return AARCH64_PARSE_INVALID_ARG;
11966 }
11967
11968 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11969 Return an aarch64_parse_opt_result describing the parse result.
11970 If the parsing fails, RES does not change. */
11971
11972 static enum aarch64_parse_opt_result
11973 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11974 {
11975 const struct processor *cpu;
11976
11977 /* Loop through the list of supported CPUs to find a match. */
11978 for (cpu = all_cores; cpu->name != NULL; cpu++)
11979 {
11980 if (strcmp (cpu->name, to_parse) == 0)
11981 {
11982 *res = cpu;
11983 return AARCH64_PARSE_OK;
11984 }
11985 }
11986
11987 /* CPU name not found in list. */
11988 return AARCH64_PARSE_INVALID_ARG;
11989 }
11990
11991 /* Parse TOKEN, which has length LENGTH to see if it is an option
11992 described in FLAG. If it is, return the index bit for that fusion type.
11993 If not, error (printing OPTION_NAME) and return zero. */
11994
11995 static unsigned int
11996 aarch64_parse_one_option_token (const char *token,
11997 size_t length,
11998 const struct aarch64_flag_desc *flag,
11999 const char *option_name)
12000 {
12001 for (; flag->name != NULL; flag++)
12002 {
12003 if (length == strlen (flag->name)
12004 && !strncmp (flag->name, token, length))
12005 return flag->flag;
12006 }
12007
12008 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12009 return 0;
12010 }
12011
12012 /* Parse OPTION which is a comma-separated list of flags to enable.
12013 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12014 default state we inherit from the CPU tuning structures. OPTION_NAME
12015 gives the top-level option we are parsing in the -moverride string,
12016 for use in error messages. */
12017
12018 static unsigned int
12019 aarch64_parse_boolean_options (const char *option,
12020 const struct aarch64_flag_desc *flags,
12021 unsigned int initial_state,
12022 const char *option_name)
12023 {
12024 const char separator = '.';
12025 const char* specs = option;
12026 const char* ntoken = option;
12027 unsigned int found_flags = initial_state;
12028
12029 while ((ntoken = strchr (specs, separator)))
12030 {
12031 size_t token_length = ntoken - specs;
12032 unsigned token_ops = aarch64_parse_one_option_token (specs,
12033 token_length,
12034 flags,
12035 option_name);
12036 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12037 in the token stream, reset the supported operations. So:
12038
12039 adrp+add.cmp+branch.none.adrp+add
12040
12041 would have the result of turning on only adrp+add fusion. */
12042 if (!token_ops)
12043 found_flags = 0;
12044
12045 found_flags |= token_ops;
12046 specs = ++ntoken;
12047 }
12048
12049 /* The string ended with a trailing separator, which is ill-formed. */
12050 if (!(*specs))
12051 {
12052 error ("%s string ill-formed\n", option_name);
12053 return 0;
12054 }
12055
12056 /* We still have one more token to parse. */
12057 size_t token_length = strlen (specs);
12058 unsigned token_ops = aarch64_parse_one_option_token (specs,
12059 token_length,
12060 flags,
12061 option_name);
12062 if (!token_ops)
12063 found_flags = 0;
12064
12065 found_flags |= token_ops;
12066 return found_flags;
12067 }
12068
12069 /* Support for overriding instruction fusion. */
12070
12071 static void
12072 aarch64_parse_fuse_string (const char *fuse_string,
12073 struct tune_params *tune)
12074 {
12075 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12076 aarch64_fusible_pairs,
12077 tune->fusible_ops,
12078 "fuse=");
12079 }
12080
12081 /* Support for overriding other tuning flags. */
12082
12083 static void
12084 aarch64_parse_tune_string (const char *tune_string,
12085 struct tune_params *tune)
12086 {
12087 tune->extra_tuning_flags
12088 = aarch64_parse_boolean_options (tune_string,
12089 aarch64_tuning_flags,
12090 tune->extra_tuning_flags,
12091 "tune=");
12092 }
12093
12094 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12095 Accept the valid SVE vector widths allowed by
12096 aarch64_sve_vector_bits_enum and use it to override sve_width
12097 in TUNE. */
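/* For instance, "-moverride=sve_width=256" (an illustrative invocation)
   would set TUNE->sve_width to SVE_256.  */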
12098
12099 static void
12100 aarch64_parse_sve_width_string (const char *tune_string,
12101 struct tune_params *tune)
12102 {
12103 int width = -1;
12104
12105 int n = sscanf (tune_string, "%d", &width);
12106 if (n == EOF)
12107 {
12108 error ("invalid format for sve_width");
12109 return;
12110 }
12111 switch (width)
12112 {
12113 case SVE_128:
12114 case SVE_256:
12115 case SVE_512:
12116 case SVE_1024:
12117 case SVE_2048:
12118 break;
12119 default:
12120 error ("invalid sve_width value: %d", width);
12121 }
12122 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12123 }
12124
12125 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12126 we understand. If it is, extract the option string and hand it off to
12127 the appropriate function. */
12128
12129 void
12130 aarch64_parse_one_override_token (const char* token,
12131 size_t length,
12132 struct tune_params *tune)
12133 {
12134 const struct aarch64_tuning_override_function *fn
12135 = aarch64_tuning_override_functions;
12136
12137 const char *option_part = strchr (token, '=');
12138 if (!option_part)
12139 {
12140 error ("tuning string missing in option (%s)", token);
12141 return;
12142 }
12143
12144 /* Get the length of the option name. */
12145 length = option_part - token;
12146 /* Skip the '=' to get to the option string. */
12147 option_part++;
12148
12149 for (; fn->name != NULL; fn++)
12150 {
12151 if (!strncmp (fn->name, token, length))
12152 {
12153 fn->parse_override (option_part, tune);
12154 return;
12155 }
12156 }
12157
12158 error ("unknown tuning option (%s)",token);
12159 return;
12160 }
12161
12162 /* Set the default TLS size and clamp it to what the code model in OPTS allows. */
12163
12164 static void
12165 initialize_aarch64_tls_size (struct gcc_options *opts)
12166 {
12167 if (aarch64_tls_size == 0)
12168 aarch64_tls_size = 24;
12169
12170 switch (opts->x_aarch64_cmodel_var)
12171 {
12172 case AARCH64_CMODEL_TINY:
12173 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
12174 needs two instructions to address, so we clamp the size to 24. */
12175 if (aarch64_tls_size > 24)
12176 aarch64_tls_size = 24;
12177 break;
12178 case AARCH64_CMODEL_SMALL:
12179 /* The maximum TLS size allowed under small is 4G. */
12180 if (aarch64_tls_size > 32)
12181 aarch64_tls_size = 32;
12182 break;
12183 case AARCH64_CMODEL_LARGE:
12184 /* The maximum TLS size allowed under large is 16E.
12185 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12186 if (aarch64_tls_size > 48)
12187 aarch64_tls_size = 48;
12188 break;
12189 default:
12190 gcc_unreachable ();
12191 }
12192
12193 return;
12194 }
12195
12196 /* Parse STRING looking for options in the format:
12197 string :: option:string
12198 option :: name=substring
12199 name :: {a-z}
12200 substring :: defined by option. */
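/* For instance (an illustrative value rather than a recommendation),
   "-moverride=fuse=adrp+add.cmp+branch:sve_width=512" contains two
   options separated by ':', each handled by the matching entry in
   aarch64_tuning_override_functions.  */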
12201
12202 static void
12203 aarch64_parse_override_string (const char* input_string,
12204 struct tune_params* tune)
12205 {
12206 const char separator = ':';
12207 size_t string_length = strlen (input_string) + 1;
12208 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12209 char *string = string_root;
12210 strncpy (string, input_string, string_length);
12211 string[string_length - 1] = '\0';
12212
12213 char* ntoken = string;
12214
12215 while ((ntoken = strchr (string, separator)))
12216 {
12217 size_t token_length = ntoken - string;
12218 /* Make this substring look like a string. */
12219 *ntoken = '\0';
12220 aarch64_parse_one_override_token (string, token_length, tune);
12221 string = ++ntoken;
12222 }
12223
12224 /* One last option to parse. */
12225 aarch64_parse_one_override_token (string, strlen (string), tune);
12226 free (string_root);
12227 }
12228
12229
12230 static void
12231 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12232 {
12233 if (accepted_branch_protection_string)
12234 {
12235 opts->x_aarch64_branch_protection_string
12236 = xstrdup (accepted_branch_protection_string);
12237 }
12238
12239 /* PR 70044: We have to be careful about being called multiple times for the
12240 same function. This means all changes should be repeatable. */
12241
12242 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12243 Disable the frame pointer flag so the mid-end will not use a frame
12244 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12245 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12246 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12247 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12248 if (opts->x_flag_omit_frame_pointer == 0)
12249 opts->x_flag_omit_frame_pointer = 2;
12250
12251 /* If not optimizing for size, set the default
12252 alignment to what the target wants. */
12253 if (!opts->x_optimize_size)
12254 {
12255 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12256 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12257 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12258 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12259 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12260 opts->x_str_align_functions = aarch64_tune_params.function_align;
12261 }
12262
12263 /* We default to no pc-relative literal loads. */
12264
12265 aarch64_pcrelative_literal_loads = false;
12266
12267 /* If -mpc-relative-literal-loads is set on the command line, this
12268 implies that the user asked for PC relative literal loads. */
12269 if (opts->x_pcrelative_literal_loads == 1)
12270 aarch64_pcrelative_literal_loads = true;
12271
12272 /* In the tiny memory model it makes no sense to disallow PC relative
12273 literal pool loads. */
12274 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12275 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12276 aarch64_pcrelative_literal_loads = true;
12277
12278 /* When enabling the lower precision Newton series for the square root, also
12279 enable it for the reciprocal square root, since the latter is an
12280 intermediary step for the former. */
12281 if (flag_mlow_precision_sqrt)
12282 flag_mrecip_low_precision_sqrt = true;
12283 }
12284
12285 /* 'Unpack' the internal tuning structs and update the options
12286 in OPTS. The caller must have set up selected_tune and selected_arch
12287 as all the other target-specific codegen decisions are
12288 derived from them. */
12289
12290 void
12291 aarch64_override_options_internal (struct gcc_options *opts)
12292 {
12293 aarch64_tune_flags = selected_tune->flags;
12294 aarch64_tune = selected_tune->sched_core;
12295 /* Make a copy of the tuning parameters attached to the core, which
12296 we may later overwrite. */
12297 aarch64_tune_params = *(selected_tune->tune);
12298 aarch64_architecture_version = selected_arch->architecture_version;
12299
12300 if (opts->x_aarch64_override_tune_string)
12301 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12302 &aarch64_tune_params);
12303
12304 /* This target defaults to strict volatile bitfields. */
12305 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12306 opts->x_flag_strict_volatile_bitfields = 1;
12307
12308 if (aarch64_stack_protector_guard == SSP_GLOBAL
12309 && opts->x_aarch64_stack_protector_guard_offset_str)
12310 {
12311 error ("incompatible options %<-mstack-protector-guard=global%> and "
12312 "%<-mstack-protector-guard-offset=%s%>",
12313 aarch64_stack_protector_guard_offset_str);
12314 }
12315
12316 if (aarch64_stack_protector_guard == SSP_SYSREG
12317 && !(opts->x_aarch64_stack_protector_guard_offset_str
12318 && opts->x_aarch64_stack_protector_guard_reg_str))
12319 {
12320 error ("both %<-mstack-protector-guard-offset%> and "
12321 "%<-mstack-protector-guard-reg%> must be used "
12322 "with %<-mstack-protector-guard=sysreg%>");
12323 }
12324
12325 if (opts->x_aarch64_stack_protector_guard_reg_str)
12326 {
12327 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12328 error ("specify a system register with a small string length.");
12329 }
12330
12331 if (opts->x_aarch64_stack_protector_guard_offset_str)
12332 {
12333 char *end;
12334 const char *str = aarch64_stack_protector_guard_offset_str;
12335 errno = 0;
12336 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12337 if (!*str || *end || errno)
12338 error ("%qs is not a valid offset in %qs", str,
12339 "-mstack-protector-guard-offset=");
12340 aarch64_stack_protector_guard_offset = offs;
12341 }
12342
12343 initialize_aarch64_code_model (opts);
12344 initialize_aarch64_tls_size (opts);
12345
12346 int queue_depth = 0;
12347 switch (aarch64_tune_params.autoprefetcher_model)
12348 {
12349 case tune_params::AUTOPREFETCHER_OFF:
12350 queue_depth = -1;
12351 break;
12352 case tune_params::AUTOPREFETCHER_WEAK:
12353 queue_depth = 0;
12354 break;
12355 case tune_params::AUTOPREFETCHER_STRONG:
12356 queue_depth = max_insn_queue_index + 1;
12357 break;
12358 default:
12359 gcc_unreachable ();
12360 }
12361
12362 /* We don't mind passing in global_options_set here as we don't use
12363 the *options_set structs anyway. */
12364 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12365 queue_depth,
12366 opts->x_param_values,
12367 global_options_set.x_param_values);
12368
12369 /* Set up parameters to be used in prefetching algorithm. Do not
12370 override the defaults unless we are tuning for a core we have
12371 researched values for. */
12372 if (aarch64_tune_params.prefetch->num_slots > 0)
12373 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12374 aarch64_tune_params.prefetch->num_slots,
12375 opts->x_param_values,
12376 global_options_set.x_param_values);
12377 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12378 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12379 aarch64_tune_params.prefetch->l1_cache_size,
12380 opts->x_param_values,
12381 global_options_set.x_param_values);
12382 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12383 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12384 aarch64_tune_params.prefetch->l1_cache_line_size,
12385 opts->x_param_values,
12386 global_options_set.x_param_values);
12387 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12388 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12389 aarch64_tune_params.prefetch->l2_cache_size,
12390 opts->x_param_values,
12391 global_options_set.x_param_values);
12392 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12393 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12394 0,
12395 opts->x_param_values,
12396 global_options_set.x_param_values);
12397 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12398 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12399 aarch64_tune_params.prefetch->minimum_stride,
12400 opts->x_param_values,
12401 global_options_set.x_param_values);
12402
12403 /* Use the alternative scheduling-pressure algorithm by default. */
12404 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12405 opts->x_param_values,
12406 global_options_set.x_param_values);
12407
12408 /* If the user hasn't changed it via configure then set the default to 64 KB
12409 for the backend. */
12410 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12411 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12412 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12413 opts->x_param_values,
12414 global_options_set.x_param_values);
12415
12416 /* Validate the guard size. */
12417 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12418
12419 /* Enforce that the probing interval is the same as the guard size so the
12420 mid-end does the right thing. */
12421 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12422 guard_size,
12423 opts->x_param_values,
12424 global_options_set.x_param_values);
12425
12426 /* The maybe_set calls won't update the value if the user has explicitly set
12427 one, which means we need to validate that the probing interval and guard size
12428 are equal. */
12429 int probe_interval
12430 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12431 if (guard_size != probe_interval)
12432 error ("stack clash guard size %<%d%> must be equal to probing interval "
12433 "%<%d%>", guard_size, probe_interval);
12434
12435 /* Enable software prefetching at the specified optimization level for
12436 CPUs that have prefetch. Lower the optimization level threshold by 1
12437 when profiling is enabled. */
12438 if (opts->x_flag_prefetch_loop_arrays < 0
12439 && !opts->x_optimize_size
12440 && aarch64_tune_params.prefetch->default_opt_level >= 0
12441 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12442 opts->x_flag_prefetch_loop_arrays = 1;
12443
12444 if (opts->x_aarch64_arch_string == NULL)
12445 opts->x_aarch64_arch_string = selected_arch->name;
12446 if (opts->x_aarch64_cpu_string == NULL)
12447 opts->x_aarch64_cpu_string = selected_cpu->name;
12448 if (opts->x_aarch64_tune_string == NULL)
12449 opts->x_aarch64_tune_string = selected_tune->name;
12450
12451 aarch64_override_options_after_change_1 (opts);
12452 }
12453
12454 /* Print a hint with a suggestion for a core or architecture name that
12455 most closely resembles what the user passed in STR. ARCH is true if
12456 the user is asking for an architecture name. ARCH is false if the user
12457 is asking for a core name. */
12458
12459 static void
12460 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12461 {
12462 auto_vec<const char *> candidates;
12463 const struct processor *entry = arch ? all_architectures : all_cores;
12464 for (; entry->name != NULL; entry++)
12465 candidates.safe_push (entry->name);
12466
12467 #ifdef HAVE_LOCAL_CPU_DETECT
12468 /* Add also "native" as possible value. */
12469 if (arch)
12470 candidates.safe_push ("native");
12471 #endif
12472
12473 char *s;
12474 const char *hint = candidates_list_and_hint (str, s, candidates);
12475 if (hint)
12476 inform (input_location, "valid arguments are: %s;"
12477 " did you mean %qs?", s, hint);
12478 else
12479 inform (input_location, "valid arguments are: %s", s);
12480
12481 XDELETEVEC (s);
12482 }
12483
12484 /* Print a hint with a suggestion for a core name that most closely resembles
12485 what the user passed in STR. */
12486
12487 inline static void
12488 aarch64_print_hint_for_core (const char *str)
12489 {
12490 aarch64_print_hint_for_core_or_arch (str, false);
12491 }
12492
12493 /* Print a hint with a suggestion for an architecture name that most closely
12494 resembles what the user passed in STR. */
12495
12496 inline static void
12497 aarch64_print_hint_for_arch (const char *str)
12498 {
12499 aarch64_print_hint_for_core_or_arch (str, true);
12500 }
12501
12502
12503 /* Print a hint with a suggestion for an extension name
12504 that most closely resembles what the user passed in STR. */
12505
12506 void
12507 aarch64_print_hint_for_extensions (const std::string &str)
12508 {
12509 auto_vec<const char *> candidates;
12510 aarch64_get_all_extension_candidates (&candidates);
12511 char *s;
12512 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12513 if (hint)
12514 inform (input_location, "valid arguments are: %s;"
12515 " did you mean %qs?", s, hint);
12516 else
12517 inform (input_location, "valid arguments are: %s", s);
12518
12519 XDELETEVEC (s);
12520 }
12521
12522 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12523 specified in STR and throw errors if appropriate. Put the results, if
12524 they are valid, in RES and ISA_FLAGS. Return whether the option is
12525 valid. */
12526
12527 static bool
12528 aarch64_validate_mcpu (const char *str, const struct processor **res,
12529 uint64_t *isa_flags)
12530 {
12531 std::string invalid_extension;
12532 enum aarch64_parse_opt_result parse_res
12533 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12534
12535 if (parse_res == AARCH64_PARSE_OK)
12536 return true;
12537
12538 switch (parse_res)
12539 {
12540 case AARCH64_PARSE_MISSING_ARG:
12541 error ("missing cpu name in %<-mcpu=%s%>", str);
12542 break;
12543 case AARCH64_PARSE_INVALID_ARG:
12544 error ("unknown value %qs for %<-mcpu%>", str);
12545 aarch64_print_hint_for_core (str);
12546 break;
12547 case AARCH64_PARSE_INVALID_FEATURE:
12548 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12549 invalid_extension.c_str (), str);
12550 aarch64_print_hint_for_extensions (invalid_extension);
12551 break;
12552 default:
12553 gcc_unreachable ();
12554 }
12555
12556 return false;
12557 }
12558
12559 /* Parses CONST_STR for branch protection features specified in
12560 aarch64_branch_protect_types, and sets any global variables required. Returns
12561 the parsing result and assigns LAST_STR to the last processed token from
12562 CONST_STR so that it can be used for error reporting. */
12563
12564 static enum
12565 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12566 char** last_str)
12567 {
12568 char *str_root = xstrdup (const_str);
12569 char* token_save = NULL;
12570 char *str = strtok_r (str_root, "+", &token_save);
12571 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12572 if (!str)
12573 res = AARCH64_PARSE_MISSING_ARG;
12574 else
12575 {
12576 char *next_str = strtok_r (NULL, "+", &token_save);
12577 /* Reset the branch protection features to their defaults. */
12578 aarch64_handle_no_branch_protection (NULL, NULL);
12579
12580 while (str && res == AARCH64_PARSE_OK)
12581 {
12582 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12583 bool found = false;
12584 /* Search for this type. */
12585 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12586 {
12587 if (strcmp (str, type->name) == 0)
12588 {
12589 found = true;
12590 res = type->handler (str, next_str);
12591 str = next_str;
12592 next_str = strtok_r (NULL, "+", &token_save);
12593 }
12594 else
12595 type++;
12596 }
12597 if (found && res == AARCH64_PARSE_OK)
12598 {
12599 bool found_subtype = true;
12600 /* Loop through each token until we find one that isn't a
12601 subtype. */
12602 while (found_subtype)
12603 {
12604 found_subtype = false;
12605 const aarch64_branch_protect_type *subtype = type->subtypes;
12606 /* Search for the subtype. */
12607 while (str && subtype && subtype->name && !found_subtype
12608 && res == AARCH64_PARSE_OK)
12609 {
12610 if (strcmp (str, subtype->name) == 0)
12611 {
12612 found_subtype = true;
12613 res = subtype->handler (str, next_str);
12614 str = next_str;
12615 next_str = strtok_r (NULL, "+", &token_save);
12616 }
12617 else
12618 subtype++;
12619 }
12620 }
12621 }
12622 else if (!found)
12623 res = AARCH64_PARSE_INVALID_ARG;
12624 }
12625 }
12626 /* Copy the last processed token into the argument to pass it back.
12627 Used by option and attribute validation to print the offending token. */
12628 if (last_str)
12629 {
12630 if (str) strcpy (*last_str, str);
12631 else *last_str = NULL;
12632 }
12633 if (res == AARCH64_PARSE_OK)
12634 {
12635 /* If needed, alloc the accepted string then copy in const_str.
12636 Used by override_option_after_change_1. */
12637 if (!accepted_branch_protection_string)
12638 accepted_branch_protection_string = (char *) xmalloc (
12639 BRANCH_PROTECT_STR_MAX
12640 + 1);
12641 strncpy (accepted_branch_protection_string, const_str,
12642 BRANCH_PROTECT_STR_MAX + 1);
12643 /* Forcibly null-terminate. */
12644 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12645 }
12646 return res;
12647 }
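/* As an illustrative example of the grammar accepted above (the exact token
   names come from aarch64_branch_protect_types): a string such as
   "pac-ret+leaf+bti" is split on '+'; "pac-ret" and "bti" are matched as
   top-level types, while "leaf" is recognised as a subtype of the preceding
   "pac-ret" token.  */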
12648
12649 static bool
12650 aarch64_validate_mbranch_protection (const char *const_str)
12651 {
12652 char *str = (char *) xmalloc (strlen (const_str) + 1);
12653 enum aarch64_parse_opt_result res =
12654 aarch64_parse_branch_protection (const_str, &str);
12655 if (res == AARCH64_PARSE_INVALID_ARG)
12656 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12657 else if (res == AARCH64_PARSE_MISSING_ARG)
12658 error ("missing argument for %<-mbranch-protection=%>");
12659 free (str);
12660 return res == AARCH64_PARSE_OK;
12661 }
12662
12663 /* Validate a command-line -march option. Parse the arch and extensions
12664 (if any) specified in STR and throw errors if appropriate. Put the
12665 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12666 option is valid. */
12667
12668 static bool
12669 aarch64_validate_march (const char *str, const struct processor **res,
12670 uint64_t *isa_flags)
12671 {
12672 std::string invalid_extension;
12673 enum aarch64_parse_opt_result parse_res
12674 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12675
12676 if (parse_res == AARCH64_PARSE_OK)
12677 return true;
12678
12679 switch (parse_res)
12680 {
12681 case AARCH64_PARSE_MISSING_ARG:
12682 error ("missing arch name in %<-march=%s%>", str);
12683 break;
12684 case AARCH64_PARSE_INVALID_ARG:
12685 error ("unknown value %qs for %<-march%>", str);
12686 aarch64_print_hint_for_arch (str);
12687 break;
12688 case AARCH64_PARSE_INVALID_FEATURE:
12689 error ("invalid feature modifier %qs in %<-march=%s%>",
12690 invalid_extension.c_str (), str);
12691 aarch64_print_hint_for_extensions (invalid_extension);
12692 break;
12693 default:
12694 gcc_unreachable ();
12695 }
12696
12697 return false;
12698 }
12699
12700 /* Validate a command-line -mtune option. Parse the cpu
12701 specified in STR and throw errors if appropriate. Put the
12702 result, if it is valid, in RES. Return whether the option is
12703 valid. */
12704
12705 static bool
12706 aarch64_validate_mtune (const char *str, const struct processor **res)
12707 {
12708 enum aarch64_parse_opt_result parse_res
12709 = aarch64_parse_tune (str, res);
12710
12711 if (parse_res == AARCH64_PARSE_OK)
12712 return true;
12713
12714 switch (parse_res)
12715 {
12716 case AARCH64_PARSE_MISSING_ARG:
12717 error ("missing cpu name in %<-mtune=%s%>", str);
12718 break;
12719 case AARCH64_PARSE_INVALID_ARG:
12720 error ("unknown value %qs for %<-mtune%>", str);
12721 aarch64_print_hint_for_core (str);
12722 break;
12723 default:
12724 gcc_unreachable ();
12725 }
12726 return false;
12727 }
12728
12729 /* Return the CPU corresponding to the enum CPU.
12730 If it doesn't specify a cpu, return the default. */
12731
12732 static const struct processor *
12733 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12734 {
12735 if (cpu != aarch64_none)
12736 return &all_cores[cpu];
12737
12738 /* The & 0x3f is to extract the bottom 6 bits that encode the
12739 default cpu as selected by the --with-cpu GCC configure option
12740 in config.gcc.
12741 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12742 flags mechanism should be reworked to make it more sane. */
12743 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12744 }
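/* Note on the encoding used above: the bottom 6 bits of TARGET_CPU_DEFAULT
   index the configure-time default CPU in all_cores, while the remaining
   bits (extracted later as TARGET_CPU_DEFAULT >> 6) hold that CPU's default
   ISA flags.  */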
12745
12746 /* Return the architecture corresponding to the enum ARCH.
12747 If it doesn't specify a valid architecture, return the default. */
12748
12749 static const struct processor *
12750 aarch64_get_arch (enum aarch64_arch arch)
12751 {
12752 if (arch != aarch64_no_arch)
12753 return &all_architectures[arch];
12754
12755 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12756
12757 return &all_architectures[cpu->arch];
12758 }
12759
12760 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12761
12762 static poly_uint16
12763 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12764 {
12765 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12766 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12767 deciding which .md file patterns to use and when deciding whether
12768 something is a legitimate address or constant. */
12769 if (value == SVE_SCALABLE || value == SVE_128)
12770 return poly_uint16 (2, 2);
12771 else
12772 return (int) value / 64;
12773 }
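/* For example, -msve-vector-bits=256 yields a constant VG of 256 / 64 = 4,
   whereas both -msve-vector-bits=scalable and -msve-vector-bits=128 yield
   the runtime-variable poly_uint16 (2, 2).  */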
12774
12775 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12776 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12777 tuning structs. In particular it must set selected_tune and
12778 aarch64_isa_flags that define the available ISA features and tuning
12779 decisions. It must also set selected_arch as this will be used to
12780 output the .arch asm tags for each function. */
12781
12782 static void
12783 aarch64_override_options (void)
12784 {
12785 uint64_t cpu_isa = 0;
12786 uint64_t arch_isa = 0;
12787 aarch64_isa_flags = 0;
12788
12789 bool valid_cpu = true;
12790 bool valid_tune = true;
12791 bool valid_arch = true;
12792
12793 selected_cpu = NULL;
12794 selected_arch = NULL;
12795 selected_tune = NULL;
12796
12797 if (aarch64_branch_protection_string)
12798 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12799
12800 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12801 If either of -march or -mtune is given, they override their
12802 respective component of -mcpu. */
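/* Illustrative example: "-mcpu=cortex-a57" selects both the Cortex-A57
   tuning and the architecture that core implements; adding an explicit
   "-march=..." replaces only the architecture/ISA part, and "-mtune=..."
   replaces only the tuning part.  */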
12803 if (aarch64_cpu_string)
12804 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12805 &cpu_isa);
12806
12807 if (aarch64_arch_string)
12808 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12809 &arch_isa);
12810
12811 if (aarch64_tune_string)
12812 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12813
12814 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12815 SUBTARGET_OVERRIDE_OPTIONS;
12816 #endif
12817
12818 /* If the user did not specify a processor, choose the default
12819 one for them. This will be the CPU set during configuration using
12820 --with-cpu, otherwise it is "generic". */
12821 if (!selected_cpu)
12822 {
12823 if (selected_arch)
12824 {
12825 selected_cpu = &all_cores[selected_arch->ident];
12826 aarch64_isa_flags = arch_isa;
12827 explicit_arch = selected_arch->arch;
12828 }
12829 else
12830 {
12831 /* Get default configure-time CPU. */
12832 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12833 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12834 }
12835
12836 if (selected_tune)
12837 explicit_tune_core = selected_tune->ident;
12838 }
12839 /* If both -mcpu and -march are specified check that they are architecturally
12840 compatible, warn if they're not and prefer the -march ISA flags. */
12841 else if (selected_arch)
12842 {
12843 if (selected_arch->arch != selected_cpu->arch)
12844 {
12845 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12846 all_architectures[selected_cpu->arch].name,
12847 selected_arch->name);
12848 }
12849 aarch64_isa_flags = arch_isa;
12850 explicit_arch = selected_arch->arch;
12851 explicit_tune_core = selected_tune ? selected_tune->ident
12852 : selected_cpu->ident;
12853 }
12854 else
12855 {
12856 /* -mcpu but no -march. */
12857 aarch64_isa_flags = cpu_isa;
12858 explicit_tune_core = selected_tune ? selected_tune->ident
12859 : selected_cpu->ident;
12860 gcc_assert (selected_cpu);
12861 selected_arch = &all_architectures[selected_cpu->arch];
12862 explicit_arch = selected_arch->arch;
12863 }
12864
12865 /* Set the arch as well, as we will need it when outputting
12866 the .arch directive in assembly. */
12867 if (!selected_arch)
12868 {
12869 gcc_assert (selected_cpu);
12870 selected_arch = &all_architectures[selected_cpu->arch];
12871 }
12872
12873 if (!selected_tune)
12874 selected_tune = selected_cpu;
12875
12876 if (aarch64_enable_bti == 2)
12877 {
12878 #ifdef TARGET_ENABLE_BTI
12879 aarch64_enable_bti = 1;
12880 #else
12881 aarch64_enable_bti = 0;
12882 #endif
12883 }
12884
12885 /* Return address signing is currently not supported for ILP32 targets. For
12886 LP64 targets use the configured option in the absence of a command-line
12887 option for -mbranch-protection. */
12888 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12889 {
12890 #ifdef TARGET_ENABLE_PAC_RET
12891 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12892 #else
12893 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12894 #endif
12895 }
12896
12897 #ifndef HAVE_AS_MABI_OPTION
12898 /* The compiler may have been configured with 2.23.* binutils, which does
12899 not have support for ILP32. */
12900 if (TARGET_ILP32)
12901 error ("assembler does not support %<-mabi=ilp32%>");
12902 #endif
12903
12904 /* Convert -msve-vector-bits to a VG count. */
12905 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12906
12907 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12908 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12909
12910 /* Make sure we properly set up the explicit options. */
12911 if ((aarch64_cpu_string && valid_cpu)
12912 || (aarch64_tune_string && valid_tune))
12913 gcc_assert (explicit_tune_core != aarch64_none);
12914
12915 if ((aarch64_cpu_string && valid_cpu)
12916 || (aarch64_arch_string && valid_arch))
12917 gcc_assert (explicit_arch != aarch64_no_arch);
12918
12919 /* The pass to insert speculation tracking runs before
12920 shrink-wrapping and the latter does not know how to update the
12921 tracking status. So disable it in this case. */
12922 if (aarch64_track_speculation)
12923 flag_shrink_wrap = 0;
12924
12925 aarch64_override_options_internal (&global_options);
12926
12927 /* Save these options as the default ones in case we push and pop them later
12928 while processing functions with potential target attributes. */
12929 target_option_default_node = target_option_current_node
12930 = build_target_option_node (&global_options);
12931 }
12932
12933 /* Implement targetm.override_options_after_change. */
12934
12935 static void
12936 aarch64_override_options_after_change (void)
12937 {
12938 aarch64_override_options_after_change_1 (&global_options);
12939 }
12940
12941 static struct machine_function *
12942 aarch64_init_machine_status (void)
12943 {
12944 struct machine_function *machine;
12945 machine = ggc_cleared_alloc<machine_function> ();
12946 return machine;
12947 }
12948
12949 void
12950 aarch64_init_expanders (void)
12951 {
12952 init_machine_status = aarch64_init_machine_status;
12953 }
12954
12955 /* A checking mechanism for the implementation of the various code models. */
12956 static void
12957 initialize_aarch64_code_model (struct gcc_options *opts)
12958 {
12959 if (opts->x_flag_pic)
12960 {
12961 switch (opts->x_aarch64_cmodel_var)
12962 {
12963 case AARCH64_CMODEL_TINY:
12964 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12965 break;
12966 case AARCH64_CMODEL_SMALL:
12967 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12968 aarch64_cmodel = (flag_pic == 2
12969 ? AARCH64_CMODEL_SMALL_PIC
12970 : AARCH64_CMODEL_SMALL_SPIC);
12971 #else
12972 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12973 #endif
12974 break;
12975 case AARCH64_CMODEL_LARGE:
12976 sorry ("code model %qs with %<-f%s%>", "large",
12977 opts->x_flag_pic > 1 ? "PIC" : "pic");
12978 break;
12979 default:
12980 gcc_unreachable ();
12981 }
12982 }
12983 else
12984 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12985 }
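/* For example, with -mcmodel=small: -fpic selects AARCH64_CMODEL_SMALL_SPIC
   (the 28K-range GOT variant used by aarch64_classify_symbol) and -fPIC
   selects AARCH64_CMODEL_SMALL_PIC (the 4G-range variant), provided the
   assembler supports the small PIC relocations; otherwise both map to
   AARCH64_CMODEL_SMALL_PIC.  */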
12986
12987 /* Implement TARGET_OPTION_SAVE. */
12988
12989 static void
12990 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12991 {
12992 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12993 ptr->x_aarch64_branch_protection_string
12994 = opts->x_aarch64_branch_protection_string;
12995 }
12996
12997 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12998 using the information saved in PTR. */
12999
13000 static void
13001 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13002 {
13003 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13004 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13005 opts->x_explicit_arch = ptr->x_explicit_arch;
13006 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13007 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13008 opts->x_aarch64_branch_protection_string
13009 = ptr->x_aarch64_branch_protection_string;
13010 if (opts->x_aarch64_branch_protection_string)
13011 {
13012 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13013 NULL);
13014 }
13015
13016 aarch64_override_options_internal (opts);
13017 }
13018
13019 /* Implement TARGET_OPTION_PRINT. */
13020
13021 static void
13022 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13023 {
13024 const struct processor *cpu
13025 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13026 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13027 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13028 std::string extension
13029 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13030
13031 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13032 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13033 arch->name, extension.c_str ());
13034 }
13035
13036 static GTY(()) tree aarch64_previous_fndecl;
13037
13038 void
13039 aarch64_reset_previous_fndecl (void)
13040 {
13041 aarch64_previous_fndecl = NULL;
13042 }
13043
13044 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13045 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13046 make sure optab availability predicates are recomputed when necessary. */
13047
13048 void
13049 aarch64_save_restore_target_globals (tree new_tree)
13050 {
13051 if (TREE_TARGET_GLOBALS (new_tree))
13052 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13053 else if (new_tree == target_option_default_node)
13054 restore_target_globals (&default_target_globals);
13055 else
13056 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13057 }
13058
13059 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13060 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13061 of the function, if such exists. This function may be called multiple
13062 times on a single function so use aarch64_previous_fndecl to avoid
13063 setting up identical state. */
13064
13065 static void
13066 aarch64_set_current_function (tree fndecl)
13067 {
13068 if (!fndecl || fndecl == aarch64_previous_fndecl)
13069 return;
13070
13071 tree old_tree = (aarch64_previous_fndecl
13072 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13073 : NULL_TREE);
13074
13075 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13076
13077 /* If current function has no attributes but the previous one did,
13078 use the default node. */
13079 if (!new_tree && old_tree)
13080 new_tree = target_option_default_node;
13081
13082 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13083 the default have been handled by aarch64_save_restore_target_globals from
13084 aarch64_pragma_target_parse. */
13085 if (old_tree == new_tree)
13086 return;
13087
13088 aarch64_previous_fndecl = fndecl;
13089
13090 /* First set the target options. */
13091 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13092
13093 aarch64_save_restore_target_globals (new_tree);
13094 }
13095
13096 /* Enum describing the various ways we can handle attributes.
13097 In many cases we can reuse the generic option handling machinery. */
13098
13099 enum aarch64_attr_opt_type
13100 {
13101 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13102 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13103 aarch64_attr_enum, /* Attribute sets an enum variable. */
13104 aarch64_attr_custom /* Attribute requires a custom handling function. */
13105 };
13106
13107 /* All the information needed to handle a target attribute.
13108 NAME is the name of the attribute.
13109 ATTR_TYPE specifies the type of behavior of the attribute as described
13110 in the definition of enum aarch64_attr_opt_type.
13111 ALLOW_NEG is true if the attribute supports a "no-" form.
13112 HANDLER is the function that takes the attribute string as an argument.
13113 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13114 OPT_NUM is the enum specifying the option that the attribute modifies.
13115 This is needed for attributes that mirror the behavior of a command-line
13116 option, that is, one whose ATTR_TYPE is aarch64_attr_mask, aarch64_attr_bool or
13117 aarch64_attr_enum. */
13118
13119 struct aarch64_attribute_info
13120 {
13121 const char *name;
13122 enum aarch64_attr_opt_type attr_type;
13123 bool allow_neg;
13124 bool (*handler) (const char *);
13125 enum opt_code opt_num;
13126 };
13127
13128 /* Handle the ARCH_STR argument to the arch= target attribute. */
13129
13130 static bool
13131 aarch64_handle_attr_arch (const char *str)
13132 {
13133 const struct processor *tmp_arch = NULL;
13134 std::string invalid_extension;
13135 enum aarch64_parse_opt_result parse_res
13136 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13137
13138 if (parse_res == AARCH64_PARSE_OK)
13139 {
13140 gcc_assert (tmp_arch);
13141 selected_arch = tmp_arch;
13142 explicit_arch = selected_arch->arch;
13143 return true;
13144 }
13145
13146 switch (parse_res)
13147 {
13148 case AARCH64_PARSE_MISSING_ARG:
13149 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13150 break;
13151 case AARCH64_PARSE_INVALID_ARG:
13152 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13153 aarch64_print_hint_for_arch (str);
13154 break;
13155 case AARCH64_PARSE_INVALID_FEATURE:
13156 error ("invalid feature modifier %s of value (\"%s\") in "
13157 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13158 aarch64_print_hint_for_extensions (invalid_extension);
13159 break;
13160 default:
13161 gcc_unreachable ();
13162 }
13163
13164 return false;
13165 }
13166
13167 /* Handle the argument CPU_STR to the cpu= target attribute. */
13168
13169 static bool
13170 aarch64_handle_attr_cpu (const char *str)
13171 {
13172 const struct processor *tmp_cpu = NULL;
13173 std::string invalid_extension;
13174 enum aarch64_parse_opt_result parse_res
13175 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13176
13177 if (parse_res == AARCH64_PARSE_OK)
13178 {
13179 gcc_assert (tmp_cpu);
13180 selected_tune = tmp_cpu;
13181 explicit_tune_core = selected_tune->ident;
13182
13183 selected_arch = &all_architectures[tmp_cpu->arch];
13184 explicit_arch = selected_arch->arch;
13185 return true;
13186 }
13187
13188 switch (parse_res)
13189 {
13190 case AARCH64_PARSE_MISSING_ARG:
13191 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13192 break;
13193 case AARCH64_PARSE_INVALID_ARG:
13194 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13195 aarch64_print_hint_for_core (str);
13196 break;
13197 case AARCH64_PARSE_INVALID_FEATURE:
13198 error ("invalid feature modifier %s of value (\"%s\") in "
13199 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13200 aarch64_print_hint_for_extensions (invalid_extension);
13201 break;
13202 default:
13203 gcc_unreachable ();
13204 }
13205
13206 return false;
13207 }
13208
13209 /* Handle the argument STR to the branch-protection= attribute. */
13210
13211 static bool
13212 aarch64_handle_attr_branch_protection (const char* str)
13213 {
13214 char *err_str = (char *) xmalloc (strlen (str) + 1);
13215 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13216 &err_str);
13217 bool success = false;
13218 switch (res)
13219 {
13220 case AARCH64_PARSE_MISSING_ARG:
13221 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13222 " attribute");
13223 break;
13224 case AARCH64_PARSE_INVALID_ARG:
13225 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13226 "=\")%> pragma or attribute", err_str);
13227 break;
13228 case AARCH64_PARSE_OK:
13229 success = true;
13230 /* Fall through. */
13231 case AARCH64_PARSE_INVALID_FEATURE:
13232 break;
13233 default:
13234 gcc_unreachable ();
13235 }
13236 free (err_str);
13237 return success;
13238 }
13239
13240 /* Handle the argument STR to the tune= target attribute. */
13241
13242 static bool
13243 aarch64_handle_attr_tune (const char *str)
13244 {
13245 const struct processor *tmp_tune = NULL;
13246 enum aarch64_parse_opt_result parse_res
13247 = aarch64_parse_tune (str, &tmp_tune);
13248
13249 if (parse_res == AARCH64_PARSE_OK)
13250 {
13251 gcc_assert (tmp_tune);
13252 selected_tune = tmp_tune;
13253 explicit_tune_core = selected_tune->ident;
13254 return true;
13255 }
13256
13257 switch (parse_res)
13258 {
13259 case AARCH64_PARSE_INVALID_ARG:
13260 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13261 aarch64_print_hint_for_core (str);
13262 break;
13263 default:
13264 gcc_unreachable ();
13265 }
13266
13267 return false;
13268 }
13269
13270 /* Parse an architecture extensions target attribute string specified in STR.
13271 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13272 if successful. Update aarch64_isa_flags to reflect the ISA features
13273 modified. */
13274
13275 static bool
13276 aarch64_handle_attr_isa_flags (char *str)
13277 {
13278 enum aarch64_parse_opt_result parse_res;
13279 uint64_t isa_flags = aarch64_isa_flags;
13280
13281 /* We allow "+nothing" in the beginning to clear out all architectural
13282 features if the user wants to handpick specific features. */
13283 if (strncmp ("+nothing", str, 8) == 0)
13284 {
13285 isa_flags = 0;
13286 str += 8;
13287 }
13288
13289 std::string invalid_extension;
13290 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13291
13292 if (parse_res == AARCH64_PARSE_OK)
13293 {
13294 aarch64_isa_flags = isa_flags;
13295 return true;
13296 }
13297
13298 switch (parse_res)
13299 {
13300 case AARCH64_PARSE_MISSING_ARG:
13301 error ("missing value in %<target()%> pragma or attribute");
13302 break;
13303
13304 case AARCH64_PARSE_INVALID_FEATURE:
13305 error ("invalid feature modifier %s of value (\"%s\") in "
13306 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13307 break;
13308
13309 default:
13310 gcc_unreachable ();
13311 }
13312
13313 return false;
13314 }
13315
13316 /* The target attributes that we support. On top of these we also support just
13317 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13318 handled explicitly in aarch64_process_one_target_attr. */
13319
13320 static const struct aarch64_attribute_info aarch64_attributes[] =
13321 {
13322 { "general-regs-only", aarch64_attr_mask, false, NULL,
13323 OPT_mgeneral_regs_only },
13324 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13325 OPT_mfix_cortex_a53_835769 },
13326 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13327 OPT_mfix_cortex_a53_843419 },
13328 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13329 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13330 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13331 OPT_momit_leaf_frame_pointer },
13332 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13333 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13334 OPT_march_ },
13335 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13336 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13337 OPT_mtune_ },
13338 { "branch-protection", aarch64_attr_custom, false,
13339 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13340 { "sign-return-address", aarch64_attr_enum, false, NULL,
13341 OPT_msign_return_address_ },
13342 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13343 };
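/* Illustrative example of how the table above is used: an attribute such as
   __attribute__ ((target ("arch=armv8-a+crc,no-strict-align,tune=cortex-a57")))
   is split on ',' by aarch64_process_target_attr; "arch=..." and "tune=..."
   go through their custom handlers, while "no-strict-align" uses the negated
   form permitted by the "strict-align" table entry.  */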
13344
13345 /* Parse ARG_STR which contains the definition of one target attribute.
13346 Show appropriate errors if any or return true if the attribute is valid. */
13347
13348 static bool
13349 aarch64_process_one_target_attr (char *arg_str)
13350 {
13351 bool invert = false;
13352
13353 size_t len = strlen (arg_str);
13354
13355 if (len == 0)
13356 {
13357 error ("malformed %<target()%> pragma or attribute");
13358 return false;
13359 }
13360
13361 char *str_to_check = (char *) alloca (len + 1);
13362 strcpy (str_to_check, arg_str);
13363
13364 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13365 It is easier to detect and handle it explicitly here rather than going
13366 through the machinery for the rest of the target attributes in this
13367 function. */
13368 if (*str_to_check == '+')
13369 return aarch64_handle_attr_isa_flags (str_to_check);
13370
13371 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13372 {
13373 invert = true;
13374 str_to_check += 3;
13375 }
13376 char *arg = strchr (str_to_check, '=');
13377
13378 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13379 and point ARG to "foo". */
13380 if (arg)
13381 {
13382 *arg = '\0';
13383 arg++;
13384 }
13385 const struct aarch64_attribute_info *p_attr;
13386 bool found = false;
13387 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13388 {
13389 /* If the names don't match up, or the user has given an argument
13390 to an attribute that doesn't accept one, or didn't give an argument
13391 to an attribute that expects one, fail to match. */
13392 if (strcmp (str_to_check, p_attr->name) != 0)
13393 continue;
13394
13395 found = true;
13396 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13397 || p_attr->attr_type == aarch64_attr_enum;
13398
13399 if (attr_need_arg_p ^ (arg != NULL))
13400 {
13401 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13402 return false;
13403 }
13404
13405 /* If the name matches but the attribute does not allow "no-" versions
13406 then we can't match. */
13407 if (invert && !p_attr->allow_neg)
13408 {
13409 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13410 return false;
13411 }
13412
13413 switch (p_attr->attr_type)
13414 {
13415 /* Has a custom handler registered.
13416 For example, cpu=, arch=, tune=. */
13417 case aarch64_attr_custom:
13418 gcc_assert (p_attr->handler);
13419 if (!p_attr->handler (arg))
13420 return false;
13421 break;
13422
13423 /* Either set or unset a boolean option. */
13424 case aarch64_attr_bool:
13425 {
13426 struct cl_decoded_option decoded;
13427
13428 generate_option (p_attr->opt_num, NULL, !invert,
13429 CL_TARGET, &decoded);
13430 aarch64_handle_option (&global_options, &global_options_set,
13431 &decoded, input_location);
13432 break;
13433 }
13434 /* Set or unset a bit in the target_flags. aarch64_handle_option
13435 should know what mask to apply given the option number. */
13436 case aarch64_attr_mask:
13437 {
13438 struct cl_decoded_option decoded;
13439 /* We only need to specify the option number.
13440 aarch64_handle_option will know which mask to apply. */
13441 decoded.opt_index = p_attr->opt_num;
13442 decoded.value = !invert;
13443 aarch64_handle_option (&global_options, &global_options_set,
13444 &decoded, input_location);
13445 break;
13446 }
13447 /* Use the option setting machinery to set an option to an enum. */
13448 case aarch64_attr_enum:
13449 {
13450 gcc_assert (arg);
13451 bool valid;
13452 int value;
13453 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13454 &value, CL_TARGET);
13455 if (valid)
13456 {
13457 set_option (&global_options, NULL, p_attr->opt_num, value,
13458 NULL, DK_UNSPECIFIED, input_location,
13459 global_dc);
13460 }
13461 else
13462 {
13463 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13464 }
13465 break;
13466 }
13467 default:
13468 gcc_unreachable ();
13469 }
13470 }
13471
13472 /* If we reached here we either have found an attribute and validated
13473 it or didn't match any. If we matched an attribute but its arguments
13474 were malformed we will have returned false already. */
13475 return found;
13476 }
13477
13478 /* Count how many times the character C appears in
13479 NULL-terminated string STR. */
13480
13481 static unsigned int
13482 num_occurences_in_str (char c, char *str)
13483 {
13484 unsigned int res = 0;
13485 while (*str != '\0')
13486 {
13487 if (*str == c)
13488 res++;
13489
13490 str++;
13491 }
13492
13493 return res;
13494 }
13495
13496 /* Parse the tree in ARGS that contains the target attribute information
13497 and update the global target options space. */
13498
13499 bool
13500 aarch64_process_target_attr (tree args)
13501 {
13502 if (TREE_CODE (args) == TREE_LIST)
13503 {
13504 do
13505 {
13506 tree head = TREE_VALUE (args);
13507 if (head)
13508 {
13509 if (!aarch64_process_target_attr (head))
13510 return false;
13511 }
13512 args = TREE_CHAIN (args);
13513 } while (args);
13514
13515 return true;
13516 }
13517
13518 if (TREE_CODE (args) != STRING_CST)
13519 {
13520 error ("attribute %<target%> argument not a string");
13521 return false;
13522 }
13523
13524 size_t len = strlen (TREE_STRING_POINTER (args));
13525 char *str_to_check = (char *) alloca (len + 1);
13526 strcpy (str_to_check, TREE_STRING_POINTER (args));
13527
13528 if (len == 0)
13529 {
13530 error ("malformed %<target()%> pragma or attribute");
13531 return false;
13532 }
13533
13534 /* Used to catch empty strings between commas, i.e.
13535 attribute ((target ("attr1,,attr2"))). */
13536 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13537
13538 /* Handle multiple target attributes separated by ','. */
13539 char *token = strtok_r (str_to_check, ",", &str_to_check);
13540
13541 unsigned int num_attrs = 0;
13542 while (token)
13543 {
13544 num_attrs++;
13545 if (!aarch64_process_one_target_attr (token))
13546 {
13547 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13548 return false;
13549 }
13550
13551 token = strtok_r (NULL, ",", &str_to_check);
13552 }
13553
13554 if (num_attrs != num_commas + 1)
13555 {
13556 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13557 return false;
13558 }
13559
13560 return true;
13561 }
13562
13563 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13564 process attribute ((target ("..."))). */
13565
13566 static bool
13567 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13568 {
13569 struct cl_target_option cur_target;
13570 bool ret;
13571 tree old_optimize;
13572 tree new_target, new_optimize;
13573 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13574
13575 /* If what we're processing is the current pragma string then the
13576 target option node is already stored in target_option_current_node
13577 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13578 having to re-parse the string. This is especially useful to keep
13579 arm_neon.h compile times down since that header contains a lot
13580 of intrinsics enclosed in pragmas. */
13581 if (!existing_target && args == current_target_pragma)
13582 {
13583 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13584 return true;
13585 }
13586 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13587
13588 old_optimize = build_optimization_node (&global_options);
13589 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13590
13591 /* If the function changed the optimization levels as well as setting
13592 target options, start with the optimizations specified. */
13593 if (func_optimize && func_optimize != old_optimize)
13594 cl_optimization_restore (&global_options,
13595 TREE_OPTIMIZATION (func_optimize));
13596
13597 /* Save the current target options to restore at the end. */
13598 cl_target_option_save (&cur_target, &global_options);
13599
13600 /* If fndecl already has some target attributes applied to it, unpack
13601 them so that we add this attribute on top of them, rather than
13602 overwriting them. */
13603 if (existing_target)
13604 {
13605 struct cl_target_option *existing_options
13606 = TREE_TARGET_OPTION (existing_target);
13607
13608 if (existing_options)
13609 cl_target_option_restore (&global_options, existing_options);
13610 }
13611 else
13612 cl_target_option_restore (&global_options,
13613 TREE_TARGET_OPTION (target_option_current_node));
13614
13615 ret = aarch64_process_target_attr (args);
13616
13617 /* Set up any additional state. */
13618 if (ret)
13619 {
13620 aarch64_override_options_internal (&global_options);
13621 /* Initialize SIMD builtins if we haven't already.
13622 Set current_target_pragma to NULL for the duration so that
13623 the builtin initialization code doesn't try to tag the functions
13624 being built with the attributes specified by any current pragma, thus
13625 going into an infinite recursion. */
13626 if (TARGET_SIMD)
13627 {
13628 tree saved_current_target_pragma = current_target_pragma;
13629 current_target_pragma = NULL;
13630 aarch64_init_simd_builtins ();
13631 current_target_pragma = saved_current_target_pragma;
13632 }
13633 new_target = build_target_option_node (&global_options);
13634 }
13635 else
13636 new_target = NULL;
13637
13638 new_optimize = build_optimization_node (&global_options);
13639
13640 if (fndecl && ret)
13641 {
13642 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13643
13644 if (old_optimize != new_optimize)
13645 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13646 }
13647
13648 cl_target_option_restore (&global_options, &cur_target);
13649
13650 if (old_optimize != new_optimize)
13651 cl_optimization_restore (&global_options,
13652 TREE_OPTIMIZATION (old_optimize));
13653 return ret;
13654 }
13655
13656 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13657 tri-bool options (yes, no, don't care) and the default value is
13658 DEF, determine whether to reject inlining. */
13659
13660 static bool
13661 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13662 int dont_care, int def)
13663 {
13664 /* If the callee doesn't care, always allow inlining. */
13665 if (callee == dont_care)
13666 return true;
13667
13668 /* If the caller doesn't care, always allow inlining. */
13669 if (caller == dont_care)
13670 return true;
13671
13672 /* Otherwise, allow inlining if either the callee and caller values
13673 agree, or if the callee is using the default value. */
13674 return (callee == caller || callee == def);
13675 }
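/* For example, with DONT_CARE == 2 and DEF == 1 (the
   -momit-leaf-frame-pointer case below): caller == 0, callee == 2 is
   allowed; caller == 0, callee == 1 is allowed because the callee matches
   the default; caller == 1, callee == 0 is rejected.  */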
13676
13677 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13678 to inline CALLEE into CALLER based on target-specific info.
13679 Make sure that the caller and callee have compatible architectural
13680 features. Then go through the other possible target attributes
13681 and see if they can block inlining. Try not to reject always_inline
13682 callees unless they are incompatible architecturally. */
13683
13684 static bool
13685 aarch64_can_inline_p (tree caller, tree callee)
13686 {
13687 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13688 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13689
13690 struct cl_target_option *caller_opts
13691 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13692 : target_option_default_node);
13693
13694 struct cl_target_option *callee_opts
13695 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13696 : target_option_default_node);
13697
13698 /* Callee's ISA flags should be a subset of the caller's. */
13699 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13700 != callee_opts->x_aarch64_isa_flags)
13701 return false;
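/* For instance, a callee compiled with an extra ISA bit such as +crc cannot
   be inlined into a caller built without that feature, since the caller's
   flags would not cover the callee's.  */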
13702
13703 /* Allow non-strict aligned functions inlining into strict
13704 aligned ones. */
13705 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13706 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13707 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13708 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13709 return false;
13710
13711 bool always_inline = lookup_attribute ("always_inline",
13712 DECL_ATTRIBUTES (callee));
13713
13714 /* If the architectural features match up and the callee is always_inline
13715 then the other attributes don't matter. */
13716 if (always_inline)
13717 return true;
13718
13719 if (caller_opts->x_aarch64_cmodel_var
13720 != callee_opts->x_aarch64_cmodel_var)
13721 return false;
13722
13723 if (caller_opts->x_aarch64_tls_dialect
13724 != callee_opts->x_aarch64_tls_dialect)
13725 return false;
13726
13727 /* Honour explicit requests to workaround errata. */
13728 if (!aarch64_tribools_ok_for_inlining_p (
13729 caller_opts->x_aarch64_fix_a53_err835769,
13730 callee_opts->x_aarch64_fix_a53_err835769,
13731 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13732 return false;
13733
13734 if (!aarch64_tribools_ok_for_inlining_p (
13735 caller_opts->x_aarch64_fix_a53_err843419,
13736 callee_opts->x_aarch64_fix_a53_err843419,
13737 2, TARGET_FIX_ERR_A53_843419))
13738 return false;
13739
13740 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13741 caller and callee and they don't match up, reject inlining. */
13742 if (!aarch64_tribools_ok_for_inlining_p (
13743 caller_opts->x_flag_omit_leaf_frame_pointer,
13744 callee_opts->x_flag_omit_leaf_frame_pointer,
13745 2, 1))
13746 return false;
13747
13748 /* If the callee has specific tuning overrides, respect them. */
13749 if (callee_opts->x_aarch64_override_tune_string != NULL
13750 && caller_opts->x_aarch64_override_tune_string == NULL)
13751 return false;
13752
13753 /* If the user specified tuning override strings for the
13754 caller and callee and they don't match up, reject inlining.
13755 We just do a string compare here, we don't analyze the meaning
13756 of the string, as it would be too costly for little gain. */
13757 if (callee_opts->x_aarch64_override_tune_string
13758 && caller_opts->x_aarch64_override_tune_string
13759 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13760 caller_opts->x_aarch64_override_tune_string) != 0))
13761 return false;
13762
13763 return true;
13764 }
13765
13766 /* Return true if SYMBOL_REF X binds locally. */
13767
13768 static bool
13769 aarch64_symbol_binds_local_p (const_rtx x)
13770 {
13771 return (SYMBOL_REF_DECL (x)
13772 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13773 : SYMBOL_REF_LOCAL_P (x));
13774 }
13775
13776 /* Return true if SYMBOL_REF X is thread local */
13777 static bool
13778 aarch64_tls_symbol_p (rtx x)
13779 {
13780 if (! TARGET_HAVE_TLS)
13781 return false;
13782
13783 if (GET_CODE (x) != SYMBOL_REF)
13784 return false;
13785
13786 return SYMBOL_REF_TLS_MODEL (x) != 0;
13787 }
13788
13789 /* Classify a TLS symbol into one of the TLS kinds. */
13790 enum aarch64_symbol_type
13791 aarch64_classify_tls_symbol (rtx x)
13792 {
13793 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13794
13795 switch (tls_kind)
13796 {
13797 case TLS_MODEL_GLOBAL_DYNAMIC:
13798 case TLS_MODEL_LOCAL_DYNAMIC:
13799 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13800
13801 case TLS_MODEL_INITIAL_EXEC:
13802 switch (aarch64_cmodel)
13803 {
13804 case AARCH64_CMODEL_TINY:
13805 case AARCH64_CMODEL_TINY_PIC:
13806 return SYMBOL_TINY_TLSIE;
13807 default:
13808 return SYMBOL_SMALL_TLSIE;
13809 }
13810
13811 case TLS_MODEL_LOCAL_EXEC:
13812 if (aarch64_tls_size == 12)
13813 return SYMBOL_TLSLE12;
13814 else if (aarch64_tls_size == 24)
13815 return SYMBOL_TLSLE24;
13816 else if (aarch64_tls_size == 32)
13817 return SYMBOL_TLSLE32;
13818 else if (aarch64_tls_size == 48)
13819 return SYMBOL_TLSLE48;
13820 else
13821 gcc_unreachable ();
13822
13823 case TLS_MODEL_EMULATED:
13824 case TLS_MODEL_NONE:
13825 return SYMBOL_FORCE_TO_MEM;
13826
13827 default:
13828 gcc_unreachable ();
13829 }
13830 }
13831
13832 /* Return the correct method for accessing X + OFFSET, where X is either
13833 a SYMBOL_REF or LABEL_REF. */
13834
13835 enum aarch64_symbol_type
13836 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13837 {
13838 if (GET_CODE (x) == LABEL_REF)
13839 {
13840 switch (aarch64_cmodel)
13841 {
13842 case AARCH64_CMODEL_LARGE:
13843 return SYMBOL_FORCE_TO_MEM;
13844
13845 case AARCH64_CMODEL_TINY_PIC:
13846 case AARCH64_CMODEL_TINY:
13847 return SYMBOL_TINY_ABSOLUTE;
13848
13849 case AARCH64_CMODEL_SMALL_SPIC:
13850 case AARCH64_CMODEL_SMALL_PIC:
13851 case AARCH64_CMODEL_SMALL:
13852 return SYMBOL_SMALL_ABSOLUTE;
13853
13854 default:
13855 gcc_unreachable ();
13856 }
13857 }
13858
13859 if (GET_CODE (x) == SYMBOL_REF)
13860 {
13861 if (aarch64_tls_symbol_p (x))
13862 return aarch64_classify_tls_symbol (x);
13863
13864 switch (aarch64_cmodel)
13865 {
13866 case AARCH64_CMODEL_TINY:
13867 /* When we retrieve symbol + offset address, we have to make sure
13868 the offset does not cause overflow of the final address. But
13869 we have no way of knowing the address of symbol at compile time
13870 so we can't accurately say if the distance between the PC and
13871 symbol + offset is outside the addressable range of +/-1M in the
13872 TINY code model. So we rely on images not being greater than
13873 1M, cap the offset at 1M, and require anything beyond 1M to
13874 be loaded using an alternative mechanism. Furthermore, if the
13875 symbol is a weak reference to something that isn't known to
13876 resolve to a symbol in this module, then force to memory. */
13877 if ((SYMBOL_REF_WEAK (x)
13878 && !aarch64_symbol_binds_local_p (x))
13879 || !IN_RANGE (offset, -1048575, 1048575))
13880 return SYMBOL_FORCE_TO_MEM;
13881 return SYMBOL_TINY_ABSOLUTE;
13882
13883 case AARCH64_CMODEL_SMALL:
13884 /* Same reasoning as the tiny code model, but the offset cap here is
13885 4G. */
13886 if ((SYMBOL_REF_WEAK (x)
13887 && !aarch64_symbol_binds_local_p (x))
13888 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13889 HOST_WIDE_INT_C (4294967264)))
13890 return SYMBOL_FORCE_TO_MEM;
13891 return SYMBOL_SMALL_ABSOLUTE;
13892
13893 case AARCH64_CMODEL_TINY_PIC:
13894 if (!aarch64_symbol_binds_local_p (x))
13895 return SYMBOL_TINY_GOT;
13896 return SYMBOL_TINY_ABSOLUTE;
13897
13898 case AARCH64_CMODEL_SMALL_SPIC:
13899 case AARCH64_CMODEL_SMALL_PIC:
13900 if (!aarch64_symbol_binds_local_p (x))
13901 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13902 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13903 return SYMBOL_SMALL_ABSOLUTE;
13904
13905 case AARCH64_CMODEL_LARGE:
13906 /* This is alright even in PIC code as the constant
13907 pool reference is always PC relative and within
13908 the same translation unit. */
13909 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13910 return SYMBOL_SMALL_ABSOLUTE;
13911 else
13912 return SYMBOL_FORCE_TO_MEM;
13913
13914 default:
13915 gcc_unreachable ();
13916 }
13917 }
13918
13919 /* By default push everything into the constant pool. */
13920 return SYMBOL_FORCE_TO_MEM;
13921 }
13922
13923 bool
13924 aarch64_constant_address_p (rtx x)
13925 {
13926 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13927 }
13928
13929 bool
13930 aarch64_legitimate_pic_operand_p (rtx x)
13931 {
13932 if (GET_CODE (x) == SYMBOL_REF
13933 || (GET_CODE (x) == CONST
13934 && GET_CODE (XEXP (x, 0)) == PLUS
13935 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13936 return false;
13937
13938 return true;
13939 }
13940
13941 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13942 that should be rematerialized rather than spilled. */
13943
13944 static bool
13945 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13946 {
13947 /* Support CSE and rematerialization of common constants. */
13948 if (CONST_INT_P (x)
13949 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13950 || GET_CODE (x) == CONST_VECTOR)
13951 return true;
13952
13953 /* Do not allow vector struct mode constants for Advanced SIMD.
13954 We could support 0 and -1 easily, but they need support in
13955 aarch64-simd.md. */
13956 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13957 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13958 return false;
13959
13960 /* Only accept variable-length vector constants if they can be
13961 handled directly.
13962
13963 ??? It would be possible to handle rematerialization of other
13964 constants via secondary reloads. */
13965 if (vec_flags & VEC_ANY_SVE)
13966 return aarch64_simd_valid_immediate (x, NULL);
13967
13968 if (GET_CODE (x) == HIGH)
13969 x = XEXP (x, 0);
13970
13971 /* Accept polynomial constants that can be calculated by using the
13972 destination of a move as the sole temporary. Constants that
13973 require a second temporary cannot be rematerialized (they can't be
13974 forced to memory and also aren't legitimate constants). */
13975 poly_int64 offset;
13976 if (poly_int_rtx_p (x, &offset))
13977 return aarch64_offset_temporaries (false, offset) <= 1;
13978
13979 /* If an offset is being added to something else, we need to allow the
13980 base to be moved into the destination register, meaning that there
13981 are no free temporaries for the offset. */
13982 x = strip_offset (x, &offset);
13983 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13984 return false;
13985
13986 /* Do not allow const (plus (anchor_symbol, const_int)). */
13987 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13988 return false;
13989
13990 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13991 so spilling them is better than rematerialization. */
13992 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13993 return true;
13994
13995 /* Label references are always constant. */
13996 if (GET_CODE (x) == LABEL_REF)
13997 return true;
13998
13999 return false;
14000 }
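/* Some illustrative consequences of the rules above: plain CONST_INTs,
   floating-point CONST_DOUBLEs and CONST_VECTORs are accepted up front;
   Advanced SIMD structure-mode constants are rejected; a non-TLS SYMBOL_REF
   counts as a legitimate constant, while a TLS symbol is spilled instead.  */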
14001
14002 rtx
14003 aarch64_load_tp (rtx target)
14004 {
14005 if (!target
14006 || GET_MODE (target) != Pmode
14007 || !register_operand (target, Pmode))
14008 target = gen_reg_rtx (Pmode);
14009
14010 /* Can return in any reg. */
14011 emit_insn (gen_aarch64_load_tp_hard (target));
14012 return target;
14013 }
14014
14015 /* On AAPCS systems, this is the "struct __va_list". */
14016 static GTY(()) tree va_list_type;
14017
14018 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14019 Return the type to use as __builtin_va_list.
14020
14021 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14022
14023 struct __va_list
14024 {
14025 void *__stack;
14026 void *__gr_top;
14027 void *__vr_top;
14028 int __gr_offs;
14029 int __vr_offs;
14030 }; */
14031
14032 static tree
14033 aarch64_build_builtin_va_list (void)
14034 {
14035 tree va_list_name;
14036 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14037
14038 /* Create the type. */
14039 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14040 /* Give it the required name. */
14041 va_list_name = build_decl (BUILTINS_LOCATION,
14042 TYPE_DECL,
14043 get_identifier ("__va_list"),
14044 va_list_type);
14045 DECL_ARTIFICIAL (va_list_name) = 1;
14046 TYPE_NAME (va_list_type) = va_list_name;
14047 TYPE_STUB_DECL (va_list_type) = va_list_name;
14048
14049 /* Create the fields. */
14050 f_stack = build_decl (BUILTINS_LOCATION,
14051 FIELD_DECL, get_identifier ("__stack"),
14052 ptr_type_node);
14053 f_grtop = build_decl (BUILTINS_LOCATION,
14054 FIELD_DECL, get_identifier ("__gr_top"),
14055 ptr_type_node);
14056 f_vrtop = build_decl (BUILTINS_LOCATION,
14057 FIELD_DECL, get_identifier ("__vr_top"),
14058 ptr_type_node);
14059 f_groff = build_decl (BUILTINS_LOCATION,
14060 FIELD_DECL, get_identifier ("__gr_offs"),
14061 integer_type_node);
14062 f_vroff = build_decl (BUILTINS_LOCATION,
14063 FIELD_DECL, get_identifier ("__vr_offs"),
14064 integer_type_node);
14065
14066 /* Tell tree-stdarg pass about our internal offset fields.
14067 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14068 purposes, to identify whether the code is updating the va_list internal
14069 offset fields in an irregular way. */
14070 va_list_gpr_counter_field = f_groff;
14071 va_list_fpr_counter_field = f_vroff;
14072
14073 DECL_ARTIFICIAL (f_stack) = 1;
14074 DECL_ARTIFICIAL (f_grtop) = 1;
14075 DECL_ARTIFICIAL (f_vrtop) = 1;
14076 DECL_ARTIFICIAL (f_groff) = 1;
14077 DECL_ARTIFICIAL (f_vroff) = 1;
14078
14079 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14080 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14081 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14082 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14083 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14084
14085 TYPE_FIELDS (va_list_type) = f_stack;
14086 DECL_CHAIN (f_stack) = f_grtop;
14087 DECL_CHAIN (f_grtop) = f_vrtop;
14088 DECL_CHAIN (f_vrtop) = f_groff;
14089 DECL_CHAIN (f_groff) = f_vroff;
14090
14091 /* Compute its layout. */
14092 layout_type (va_list_type);
14093
14094 return va_list_type;
14095 }
14096
14097 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14098 static void
14099 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14100 {
14101 const CUMULATIVE_ARGS *cum;
14102 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14103 tree stack, grtop, vrtop, groff, vroff;
14104 tree t;
14105 int gr_save_area_size = cfun->va_list_gpr_size;
14106 int vr_save_area_size = cfun->va_list_fpr_size;
14107 int vr_offset;
14108
14109 cum = &crtl->args.info;
14110 if (cfun->va_list_gpr_size)
14111 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14112 cfun->va_list_gpr_size);
14113 if (cfun->va_list_fpr_size)
14114 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14115 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14116
14117 if (!TARGET_FLOAT)
14118 {
14119 gcc_assert (cum->aapcs_nvrn == 0);
14120 vr_save_area_size = 0;
14121 }
14122
14123 f_stack = TYPE_FIELDS (va_list_type_node);
14124 f_grtop = DECL_CHAIN (f_stack);
14125 f_vrtop = DECL_CHAIN (f_grtop);
14126 f_groff = DECL_CHAIN (f_vrtop);
14127 f_vroff = DECL_CHAIN (f_groff);
14128
14129 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14130 NULL_TREE);
14131 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14132 NULL_TREE);
14133 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14134 NULL_TREE);
14135 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14136 NULL_TREE);
14137 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14138 NULL_TREE);
14139
14140 /* Emit code to initialize STACK, which points to the next varargs stack
14141 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14142 by named arguments. STACK is 8-byte aligned. */
14143 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14144 if (cum->aapcs_stack_size > 0)
14145 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14146 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14147 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14148
14149 /* Emit code to initialize GRTOP, the top of the GR save area.
14150 virtual_incoming_args_rtx should have been 16 byte aligned. */
14151 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14152 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14153 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14154
14155 /* Emit code to initialize VRTOP, the top of the VR save area.
14156 This address is gr_save_area_bytes below GRTOP, rounded
14157 down to the next 16-byte boundary. */
14158 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14159 vr_offset = ROUND_UP (gr_save_area_size,
14160 STACK_BOUNDARY / BITS_PER_UNIT);
14161
14162 if (vr_offset)
14163 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14164 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14165 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14166
14167 /* Emit code to initialize GROFF, the offset from GRTOP of the
14168 next GPR argument. */
14169 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14170 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14171 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14172
14173 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14174 of the next VR argument. */
14175 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14176 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14177 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14178 }
14179
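/* Worked example (an assumption for illustration; it holds only with FP
   enabled and when the tree-stdarg pass has not shrunk the save areas):
   for "void f (int n, ...)", one GP register (x0) is named, so

     gr_save_area_size = (8 - 1) * 8  = 56
     vr_save_area_size = (8 - 0) * 16 = 128

   and va_start leaves __gr_offs = -56, __vr_offs = -128,
   __gr_top = the virtual incoming args pointer and
   __vr_top = __gr_top - ROUND_UP (56, 16) = __gr_top - 64.  */
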
14180 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14181
14182 static tree
14183 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14184 gimple_seq *post_p ATTRIBUTE_UNUSED)
14185 {
14186 tree addr;
14187 bool indirect_p;
14188 bool is_ha; /* is HFA or HVA. */
14189 bool dw_align; /* double-word align. */
14190 machine_mode ag_mode = VOIDmode;
14191 int nregs;
14192 machine_mode mode;
14193
14194 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14195 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14196 HOST_WIDE_INT size, rsize, adjust, align;
14197 tree t, u, cond1, cond2;
14198
14199 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14200 if (indirect_p)
14201 type = build_pointer_type (type);
14202
14203 mode = TYPE_MODE (type);
14204
14205 f_stack = TYPE_FIELDS (va_list_type_node);
14206 f_grtop = DECL_CHAIN (f_stack);
14207 f_vrtop = DECL_CHAIN (f_grtop);
14208 f_groff = DECL_CHAIN (f_vrtop);
14209 f_vroff = DECL_CHAIN (f_groff);
14210
14211 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14212 f_stack, NULL_TREE);
14213 size = int_size_in_bytes (type);
14214
14215 bool abi_break;
14216 align
14217 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14218
14219 dw_align = false;
14220 adjust = 0;
14221 if (aarch64_vfp_is_call_or_return_candidate (mode,
14222 type,
14223 &ag_mode,
14224 &nregs,
14225 &is_ha))
14226 {
14227 /* No frontends can create types with variable-sized modes, so we
14228 shouldn't be asked to pass or return them. */
14229 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14230
14231 /* TYPE passed in fp/simd registers. */
14232 if (!TARGET_FLOAT)
14233 aarch64_err_no_fpadvsimd (mode);
14234
14235 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14236 unshare_expr (valist), f_vrtop, NULL_TREE);
14237 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14238 unshare_expr (valist), f_vroff, NULL_TREE);
14239
14240 rsize = nregs * UNITS_PER_VREG;
14241
14242 if (is_ha)
14243 {
14244 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14245 adjust = UNITS_PER_VREG - ag_size;
14246 }
14247 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14248 && size < UNITS_PER_VREG)
14249 {
14250 adjust = UNITS_PER_VREG - size;
14251 }
14252 }
14253 else
14254 {
14255 /* TYPE passed in general registers. */
14256 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14257 unshare_expr (valist), f_grtop, NULL_TREE);
14258 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14259 unshare_expr (valist), f_groff, NULL_TREE);
14260 rsize = ROUND_UP (size, UNITS_PER_WORD);
14261 nregs = rsize / UNITS_PER_WORD;
14262
14263 if (align > 8)
14264 {
14265 if (abi_break && warn_psabi)
14266 inform (input_location, "parameter passing for argument of type "
14267 "%qT changed in GCC 9.1", type);
14268 dw_align = true;
14269 }
14270
14271 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14272 && size < UNITS_PER_WORD)
14273 {
14274 adjust = UNITS_PER_WORD - size;
14275 }
14276 }
14277
14278 /* Get a local temporary for the field value. */
14279 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14280
14281 /* Emit code to branch if off >= 0. */
14282 t = build2 (GE_EXPR, boolean_type_node, off,
14283 build_int_cst (TREE_TYPE (off), 0));
14284 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14285
14286 if (dw_align)
14287 {
14288 /* Emit: offs = (offs + 15) & -16. */
14289 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14290 build_int_cst (TREE_TYPE (off), 15));
14291 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14292 build_int_cst (TREE_TYPE (off), -16));
14293 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14294 }
14295 else
14296 roundup = NULL;
14297
14298 /* Update ap.__[g|v]r_offs */
14299 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14300 build_int_cst (TREE_TYPE (off), rsize));
14301 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14302
14303 /* Chain the roundup (if any) before the offset update. */
14304 if (roundup)
14305 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14306
14307 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14308 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14309 build_int_cst (TREE_TYPE (f_off), 0));
14310 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14311
14312 /* String up: make sure the assignment happens before the use. */
14313 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14314 COND_EXPR_ELSE (cond1) = t;
14315
14316 /* Prepare the trees handling the argument that is passed on the stack;
14317 the top level node will store in ON_STACK. */
14318 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14319 if (align > 8)
14320 {
14321 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
14322 t = fold_build_pointer_plus_hwi (arg, 15);
14323 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14324 build_int_cst (TREE_TYPE (t), -16));
14325 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14326 }
14327 else
14328 roundup = NULL;
14329 /* Advance ap.__stack */
14330 t = fold_build_pointer_plus_hwi (arg, size + 7);
14331 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14332 build_int_cst (TREE_TYPE (t), -8));
14333 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14334 /* String up roundup and advance. */
14335 if (roundup)
14336 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14337 /* String up with arg */
14338 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14339 /* Big-endianness related address adjustment. */
14340 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14341 && size < UNITS_PER_WORD)
14342 {
14343 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14344 size_int (UNITS_PER_WORD - size));
14345 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14346 }
14347
14348 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14349 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14350
14351 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14352 t = off;
14353 if (adjust)
14354 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14355 build_int_cst (TREE_TYPE (off), adjust));
14356
14357 t = fold_convert (sizetype, t);
14358 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14359
14360 if (is_ha)
14361 {
14362 /* type ha; // treat as "struct {ftype field[n];}"
14363 ... [computing offs]
14364 for (i = 0; i < nregs; ++i, offs += 16)
14365 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14366 return ha; */
14367 int i;
14368 tree tmp_ha, field_t, field_ptr_t;
14369
14370 /* Declare a local variable. */
14371 tmp_ha = create_tmp_var_raw (type, "ha");
14372 gimple_add_tmp_var (tmp_ha);
14373
14374 /* Establish the base type. */
14375 switch (ag_mode)
14376 {
14377 case E_SFmode:
14378 field_t = float_type_node;
14379 field_ptr_t = float_ptr_type_node;
14380 break;
14381 case E_DFmode:
14382 field_t = double_type_node;
14383 field_ptr_t = double_ptr_type_node;
14384 break;
14385 case E_TFmode:
14386 field_t = long_double_type_node;
14387 field_ptr_t = long_double_ptr_type_node;
14388 break;
14389 case E_HFmode:
14390 field_t = aarch64_fp16_type_node;
14391 field_ptr_t = aarch64_fp16_ptr_type_node;
14392 break;
14393 case E_V2SImode:
14394 case E_V4SImode:
14395 {
14396 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14397 field_t = build_vector_type_for_mode (innertype, ag_mode);
14398 field_ptr_t = build_pointer_type (field_t);
14399 }
14400 break;
14401 default:
14402 gcc_assert (0);
14403 }
14404
14405 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14406 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14407 addr = t;
14408 t = fold_convert (field_ptr_t, addr);
14409 t = build2 (MODIFY_EXPR, field_t,
14410 build1 (INDIRECT_REF, field_t, tmp_ha),
14411 build1 (INDIRECT_REF, field_t, t));
14412
14413 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14414 for (i = 1; i < nregs; ++i)
14415 {
14416 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14417 u = fold_convert (field_ptr_t, addr);
14418 u = build2 (MODIFY_EXPR, field_t,
14419 build2 (MEM_REF, field_t, tmp_ha,
14420 build_int_cst (field_ptr_t,
14421 (i *
14422 int_size_in_bytes (field_t)))),
14423 build1 (INDIRECT_REF, field_t, u));
14424 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14425 }
14426
14427 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14428 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14429 }
14430
14431 COND_EXPR_ELSE (cond2) = t;
14432 addr = fold_convert (build_pointer_type (type), cond1);
14433 addr = build_va_arg_indirect_ref (addr);
14434
14435 if (indirect_p)
14436 addr = build_va_arg_indirect_ref (addr);
14437
14438 return addr;
14439 }
14440
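/* A rough sketch of the gimplified va_arg sequence for a plain integer
   TYPE (an illustrative assumption; it ignores over-alignment, big-endian
   adjustments and the HFA copy loop handled above):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                  // GP save area already exhausted
     ap.__gr_offs = off + rsize;       // rsize = size rounded up to 8
     if (ap.__gr_offs > 0)
       goto on_stack;                  // this argument did not fit either
     addr = ap.__gr_top + off;         // argument is in the save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (char *) (((uintptr_t) addr + size + 7) & -8);
   done:
     result = *(TYPE *) addr;  */
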
14441 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14442
14443 static void
14444 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14445 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14446 int no_rtl)
14447 {
14448 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14449 CUMULATIVE_ARGS local_cum;
14450 int gr_saved = cfun->va_list_gpr_size;
14451 int vr_saved = cfun->va_list_fpr_size;
14452
14453 /* The caller has advanced CUM up to, but not beyond, the last named
14454 argument. Advance a local copy of CUM past the last "real" named
14455 argument, to find out how many registers are left over. */
14456 local_cum = *cum;
14457 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14458
14459 /* Find out how many registers we need to save.
14460 Honor the tree-stdarg analysis results. */
14461 if (cfun->va_list_gpr_size)
14462 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14463 cfun->va_list_gpr_size / UNITS_PER_WORD);
14464 if (cfun->va_list_fpr_size)
14465 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14466 cfun->va_list_fpr_size / UNITS_PER_VREG);
14467
14468 if (!TARGET_FLOAT)
14469 {
14470 gcc_assert (local_cum.aapcs_nvrn == 0);
14471 vr_saved = 0;
14472 }
14473
14474 if (!no_rtl)
14475 {
14476 if (gr_saved > 0)
14477 {
14478 rtx ptr, mem;
14479
14480 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14481 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14482 - gr_saved * UNITS_PER_WORD);
14483 mem = gen_frame_mem (BLKmode, ptr);
14484 set_mem_alias_set (mem, get_varargs_alias_set ());
14485
14486 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14487 mem, gr_saved);
14488 }
14489 if (vr_saved > 0)
14490 {
14491 /* We can't use move_block_from_reg, because it will use
14492 the wrong mode, storing D regs only. */
14493 machine_mode mode = TImode;
14494 int off, i, vr_start;
14495
14496 /* Set OFF to the offset from virtual_incoming_args_rtx of
14497 the first vector register. The VR save area lies below
14498 the GR one, and is aligned to 16 bytes. */
14499 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14500 STACK_BOUNDARY / BITS_PER_UNIT);
14501 off -= vr_saved * UNITS_PER_VREG;
14502
14503 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14504 for (i = 0; i < vr_saved; ++i)
14505 {
14506 rtx ptr, mem;
14507
14508 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14509 mem = gen_frame_mem (mode, ptr);
14510 set_mem_alias_set (mem, get_varargs_alias_set ());
14511 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14512 off += UNITS_PER_VREG;
14513 }
14514 }
14515 }
14516
14517 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14518 any complication of having crtl->args.pretend_args_size changed. */
14519 cfun->machine->frame.saved_varargs_size
14520 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14521 STACK_BOUNDARY / BITS_PER_UNIT)
14522 + vr_saved * UNITS_PER_VREG);
14523 }
14524
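/* Resulting layout of the varargs save area (illustration only):

     virtual_incoming_args_rtx ->      named stack arguments live above this
       x<ncrn> ... x7                  gr_saved * 8 bytes, ending here
     next 16-byte boundary below ->    this is what __vr_top points at
       q<nvrn> ... q7                  vr_saved * 16 bytes, ending here  */

/* Implement TARGET_CONDITIONAL_REGISTER_USAGE.  */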
14525 static void
14526 aarch64_conditional_register_usage (void)
14527 {
14528 int i;
14529 if (!TARGET_FLOAT)
14530 {
14531 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14532 {
14533 fixed_regs[i] = 1;
14534 call_used_regs[i] = 1;
14535 }
14536 }
14537 if (!TARGET_SVE)
14538 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14539 {
14540 fixed_regs[i] = 1;
14541 call_used_regs[i] = 1;
14542 }
14543
14544 /* When tracking speculation, we need a couple of call-clobbered registers
14545 to track the speculation state. It would be nice to just use
14546 IP0 and IP1, but currently there are numerous places that just
14547 assume these registers are free for other uses (eg pointer
14548 authentication). */
14549 if (aarch64_track_speculation)
14550 {
14551 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14552 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14553 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14554 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14555 }
14556 }
14557
14558 /* Walk down the type tree of TYPE counting consecutive base elements.
14559 If *MODEP is VOIDmode, then set it to the first valid floating point
14560 type. If a non-floating point type is found, or if a floating point
14561 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14562 otherwise return the count in the sub-tree. */
14563 static int
14564 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14565 {
14566 machine_mode mode;
14567 HOST_WIDE_INT size;
14568
14569 switch (TREE_CODE (type))
14570 {
14571 case REAL_TYPE:
14572 mode = TYPE_MODE (type);
14573 if (mode != DFmode && mode != SFmode
14574 && mode != TFmode && mode != HFmode)
14575 return -1;
14576
14577 if (*modep == VOIDmode)
14578 *modep = mode;
14579
14580 if (*modep == mode)
14581 return 1;
14582
14583 break;
14584
14585 case COMPLEX_TYPE:
14586 mode = TYPE_MODE (TREE_TYPE (type));
14587 if (mode != DFmode && mode != SFmode
14588 && mode != TFmode && mode != HFmode)
14589 return -1;
14590
14591 if (*modep == VOIDmode)
14592 *modep = mode;
14593
14594 if (*modep == mode)
14595 return 2;
14596
14597 break;
14598
14599 case VECTOR_TYPE:
14600 /* Use V2SImode and V4SImode as representatives of all 64-bit
14601 and 128-bit vector types. */
14602 size = int_size_in_bytes (type);
14603 switch (size)
14604 {
14605 case 8:
14606 mode = V2SImode;
14607 break;
14608 case 16:
14609 mode = V4SImode;
14610 break;
14611 default:
14612 return -1;
14613 }
14614
14615 if (*modep == VOIDmode)
14616 *modep = mode;
14617
14618 /* Vector modes are considered to be opaque: two vectors are
14619 equivalent for the purposes of being homogeneous aggregates
14620 if they are the same size. */
14621 if (*modep == mode)
14622 return 1;
14623
14624 break;
14625
14626 case ARRAY_TYPE:
14627 {
14628 int count;
14629 tree index = TYPE_DOMAIN (type);
14630
14631 /* Can't handle incomplete types nor sizes that are not
14632 fixed. */
14633 if (!COMPLETE_TYPE_P (type)
14634 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14635 return -1;
14636
14637 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14638 if (count == -1
14639 || !index
14640 || !TYPE_MAX_VALUE (index)
14641 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14642 || !TYPE_MIN_VALUE (index)
14643 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14644 || count < 0)
14645 return -1;
14646
14647 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14648 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14649
14650 /* There must be no padding. */
14651 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14652 count * GET_MODE_BITSIZE (*modep)))
14653 return -1;
14654
14655 return count;
14656 }
14657
14658 case RECORD_TYPE:
14659 {
14660 int count = 0;
14661 int sub_count;
14662 tree field;
14663
14664 /* Can't handle incomplete types nor sizes that are not
14665 fixed. */
14666 if (!COMPLETE_TYPE_P (type)
14667 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14668 return -1;
14669
14670 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14671 {
14672 if (TREE_CODE (field) != FIELD_DECL)
14673 continue;
14674
14675 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14676 if (sub_count < 0)
14677 return -1;
14678 count += sub_count;
14679 }
14680
14681 /* There must be no padding. */
14682 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14683 count * GET_MODE_BITSIZE (*modep)))
14684 return -1;
14685
14686 return count;
14687 }
14688
14689 case UNION_TYPE:
14690 case QUAL_UNION_TYPE:
14691 {
14692 /* These aren't very interesting except in a degenerate case. */
14693 int count = 0;
14694 int sub_count;
14695 tree field;
14696
14697 /* Can't handle incomplete types nor sizes that are not
14698 fixed. */
14699 if (!COMPLETE_TYPE_P (type)
14700 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14701 return -1;
14702
14703 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14704 {
14705 if (TREE_CODE (field) != FIELD_DECL)
14706 continue;
14707
14708 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14709 if (sub_count < 0)
14710 return -1;
14711 count = count > sub_count ? count : sub_count;
14712 }
14713
14714 /* There must be no padding. */
14715 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14716 count * GET_MODE_BITSIZE (*modep)))
14717 return -1;
14718
14719 return count;
14720 }
14721
14722 default:
14723 break;
14724 }
14725
14726 return -1;
14727 }
14728
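/* Some examples of the classification above (illustrative only), with
   *MODEP starting out as VOIDmode:

     struct { float x, y, z; }         -> 3, *modep == SFmode
     double[2]                         -> 2, *modep == DFmode
     _Complex float                    -> 2, *modep == SFmode
     struct { float x; double y; }     -> -1 (mismatched base types)
     struct { float x; int i; }        -> -1 (non-floating-point member)  */
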
14729 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14730 type as described in AAPCS64 \S 4.1.2.
14731
14732 See the comment above aarch64_composite_type_p for the notes on MODE. */
14733
14734 static bool
14735 aarch64_short_vector_p (const_tree type,
14736 machine_mode mode)
14737 {
14738 poly_int64 size = -1;
14739
14740 if (type && TREE_CODE (type) == VECTOR_TYPE)
14741 size = int_size_in_bytes (type);
14742 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14743 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14744 size = GET_MODE_SIZE (mode);
14745
14746 return known_eq (size, 8) || known_eq (size, 16);
14747 }
14748
14749 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14750 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14751 array types. The C99 floating-point complex types are also considered
14752 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14753 types, which are GCC extensions and out of the scope of AAPCS64, are
14754 treated as composite types here as well.
14755
14756 Note that MODE itself is not sufficient in determining whether a type
14757 is such a composite type or not. This is because
14758 stor-layout.c:compute_record_mode may have already changed the MODE
14759 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14760 structure with only one field may have its MODE set to the mode of the
14761 field. Also an integer mode whose size matches the size of the
14762 RECORD_TYPE type may be used to substitute the original mode
14763 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14764 solely relied on. */
14765
14766 static bool
14767 aarch64_composite_type_p (const_tree type,
14768 machine_mode mode)
14769 {
14770 if (aarch64_short_vector_p (type, mode))
14771 return false;
14772
14773 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14774 return true;
14775
14776 if (mode == BLKmode
14777 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14778 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14779 return true;
14780
14781 return false;
14782 }
14783
14784 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14785 shall be passed or returned in simd/fp register(s) (providing these
14786 parameter passing registers are available).
14787
14788 Upon successful return, *COUNT returns the number of needed registers,
14789 *BASE_MODE returns the mode of the individual register and when IS_HAF
14790 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14791 floating-point aggregate or a homogeneous short-vector aggregate. */
14792
14793 static bool
14794 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14795 const_tree type,
14796 machine_mode *base_mode,
14797 int *count,
14798 bool *is_ha)
14799 {
14800 machine_mode new_mode = VOIDmode;
14801 bool composite_p = aarch64_composite_type_p (type, mode);
14802
14803 if (is_ha != NULL) *is_ha = false;
14804
14805 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14806 || aarch64_short_vector_p (type, mode))
14807 {
14808 *count = 1;
14809 new_mode = mode;
14810 }
14811 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14812 {
14813 if (is_ha != NULL) *is_ha = true;
14814 *count = 2;
14815 new_mode = GET_MODE_INNER (mode);
14816 }
14817 else if (type && composite_p)
14818 {
14819 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14820
14821 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14822 {
14823 if (is_ha != NULL) *is_ha = true;
14824 *count = ag_count;
14825 }
14826 else
14827 return false;
14828 }
14829 else
14830 return false;
14831
14832 *base_mode = new_mode;
14833 return true;
14834 }
14835
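/* For example (illustration only): a struct of four floats is a
   homogeneous floating-point aggregate, so the function above returns
   true with *count == 4, *base_mode == SFmode and *is_ha set; a
   _Complex double yields *count == 2 and *base_mode == DFmode; a struct
   of five floats exceeds HA_MAX_NUM_FLDS and is rejected.  */
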
14836 /* Implement TARGET_STRUCT_VALUE_RTX. */
14837
14838 static rtx
14839 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14840 int incoming ATTRIBUTE_UNUSED)
14841 {
14842 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14843 }
14844
14845 /* Implements target hook vector_mode_supported_p. */
14846 static bool
14847 aarch64_vector_mode_supported_p (machine_mode mode)
14848 {
14849 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14850 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14851 }
14852
14853 /* Return the full-width SVE vector mode for element mode MODE, if one
14854 exists. */
14855 opt_machine_mode
14856 aarch64_full_sve_mode (scalar_mode mode)
14857 {
14858 switch (mode)
14859 {
14860 case E_DFmode:
14861 return VNx2DFmode;
14862 case E_SFmode:
14863 return VNx4SFmode;
14864 case E_HFmode:
14865 return VNx8HFmode;
14866 case E_DImode:
14867 return VNx2DImode;
14868 case E_SImode:
14869 return VNx4SImode;
14870 case E_HImode:
14871 return VNx8HImode;
14872 case E_QImode:
14873 return VNx16QImode;
14874 default:
14875 return opt_machine_mode ();
14876 }
14877 }
14878
14879 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14880 if it exists. */
14881 opt_machine_mode
14882 aarch64_vq_mode (scalar_mode mode)
14883 {
14884 switch (mode)
14885 {
14886 case E_DFmode:
14887 return V2DFmode;
14888 case E_SFmode:
14889 return V4SFmode;
14890 case E_HFmode:
14891 return V8HFmode;
14892 case E_SImode:
14893 return V4SImode;
14894 case E_HImode:
14895 return V8HImode;
14896 case E_QImode:
14897 return V16QImode;
14898 case E_DImode:
14899 return V2DImode;
14900 default:
14901 return opt_machine_mode ();
14902 }
14903 }
14904
14905 /* Return appropriate SIMD container
14906 for MODE within a vector of WIDTH bits. */
14907 static machine_mode
14908 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14909 {
14910 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14911 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14912
14913 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14914 if (TARGET_SIMD)
14915 {
14916 if (known_eq (width, 128))
14917 return aarch64_vq_mode (mode).else_mode (word_mode);
14918 else
14919 switch (mode)
14920 {
14921 case E_SFmode:
14922 return V2SFmode;
14923 case E_HFmode:
14924 return V4HFmode;
14925 case E_SImode:
14926 return V2SImode;
14927 case E_HImode:
14928 return V4HImode;
14929 case E_QImode:
14930 return V8QImode;
14931 default:
14932 break;
14933 }
14934 }
14935 return word_mode;
14936 }
14937
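/* For example (illustrative): without SVE, (SImode, 128) maps to V4SImode
   and (HFmode, 64) to V4HFmode; with SVE and WIDTH equal to the SVE vector
   width, SImode maps to VNx4SImode.  Unsupported combinations fall back to
   word_mode.  */
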
14938 /* Return the SVE or 128-bit container as the preferred SIMD mode for MODE. */
14939 static machine_mode
14940 aarch64_preferred_simd_mode (scalar_mode mode)
14941 {
14942 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14943 return aarch64_simd_container_mode (mode, bits);
14944 }
14945
14946 /* Return a list of possible vector sizes for the vectorizer
14947 to iterate over. */
14948 static void
14949 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14950 {
14951 if (TARGET_SVE)
14952 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14953 sizes->safe_push (16);
14954 sizes->safe_push (8);
14955 }
14956
14957 /* Implement TARGET_MANGLE_TYPE. */
14958
14959 static const char *
14960 aarch64_mangle_type (const_tree type)
14961 {
14962 /* The AArch64 ABI documents say that "__va_list" has to be
14963 mangled as if it is in the "std" namespace. */
14964 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14965 return "St9__va_list";
14966
14967 /* Half-precision float. */
14968 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14969 return "Dh";
14970
14971 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14972 builtin types. */
14973 if (TYPE_NAME (type) != NULL)
14974 return aarch64_mangle_builtin_type (type);
14975
14976 /* Use the default mangling. */
14977 return NULL;
14978 }
14979
14980 /* Find the first rtx_insn before insn that will generate an assembly
14981 instruction. */
14982
14983 static rtx_insn *
14984 aarch64_prev_real_insn (rtx_insn *insn)
14985 {
14986 if (!insn)
14987 return NULL;
14988
14989 do
14990 {
14991 insn = prev_real_insn (insn);
14992 }
14993 while (insn && recog_memoized (insn) < 0);
14994
14995 return insn;
14996 }
14997
14998 static bool
14999 is_madd_op (enum attr_type t1)
15000 {
15001 unsigned int i;
15002 /* A number of these may be AArch32 only. */
15003 enum attr_type mlatypes[] = {
15004 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15005 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15006 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15007 };
15008
15009 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15010 {
15011 if (t1 == mlatypes[i])
15012 return true;
15013 }
15014
15015 return false;
15016 }
15017
15018 /* Check if there is a register dependency between a load and the insn
15019 for which we hold recog_data. */
15020
15021 static bool
15022 dep_between_memop_and_curr (rtx memop)
15023 {
15024 rtx load_reg;
15025 int opno;
15026
15027 gcc_assert (GET_CODE (memop) == SET);
15028
15029 if (!REG_P (SET_DEST (memop)))
15030 return false;
15031
15032 load_reg = SET_DEST (memop);
15033 for (opno = 1; opno < recog_data.n_operands; opno++)
15034 {
15035 rtx operand = recog_data.operand[opno];
15036 if (REG_P (operand)
15037 && reg_overlap_mentioned_p (load_reg, operand))
15038 return true;
15039
15040 }
15041 return false;
15042 }
15043
15044
15045 /* When working around the Cortex-A53 erratum 835769,
15046 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15047 instruction and has a preceding memory instruction such that a NOP
15048 should be inserted between them. */
15049
15050 bool
15051 aarch64_madd_needs_nop (rtx_insn* insn)
15052 {
15053 enum attr_type attr_type;
15054 rtx_insn *prev;
15055 rtx body;
15056
15057 if (!TARGET_FIX_ERR_A53_835769)
15058 return false;
15059
15060 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15061 return false;
15062
15063 attr_type = get_attr_type (insn);
15064 if (!is_madd_op (attr_type))
15065 return false;
15066
15067 prev = aarch64_prev_real_insn (insn);
15068 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15069 Restore recog state to INSN to avoid state corruption. */
15070 extract_constrain_insn_cached (insn);
15071
15072 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15073 return false;
15074
15075 body = single_set (prev);
15076
15077 /* If the previous insn is a memory op and there is no dependency between
15078 it and the DImode madd, emit a NOP between them. If body is NULL then we
15079 have a complex memory operation, probably a load/store pair.
15080 Be conservative for now and emit a NOP. */
15081 if (GET_MODE (recog_data.operand[0]) == DImode
15082 && (!body || !dep_between_memop_and_curr (body)))
15083 return true;
15084
15085 return false;
15086
15087 }
15088
15089
15090 /* Implement FINAL_PRESCAN_INSN. */
15091
15092 void
15093 aarch64_final_prescan_insn (rtx_insn *insn)
15094 {
15095 if (aarch64_madd_needs_nop (insn))
15096 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15097 }
15098
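/* For illustration (the concrete instructions are an assumption): with
   -mfix-cortex-a53-835769, a sequence such as

       ldr  x10, [sp, 16]
       madd x0, x1, x2, x3

   is emitted as

       ldr  x10, [sp, 16]
       nop  // between mem op and mult-accumulate
       madd x0, x1, x2, x3  */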
15099
15100 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15101 instruction. */
15102
15103 bool
15104 aarch64_sve_index_immediate_p (rtx base_or_step)
15105 {
15106 return (CONST_INT_P (base_or_step)
15107 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15108 }
15109
15110 /* Return true if X is a valid immediate for the SVE ADD and SUB
15111 instructions. Negate X first if NEGATE_P is true. */
15112
15113 bool
15114 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15115 {
15116 rtx elt;
15117
15118 if (!const_vec_duplicate_p (x, &elt)
15119 || !CONST_INT_P (elt))
15120 return false;
15121
15122 HOST_WIDE_INT val = INTVAL (elt);
15123 if (negate_p)
15124 val = -val;
15125 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15126
15127 if (val & 0xff)
15128 return IN_RANGE (val, 0, 0xff);
15129 return IN_RANGE (val, 0, 0xff00);
15130 }
15131
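/* So the accepted values are 0..255, or multiples of 256 up to 65280
   (matching the "#imm, LSL #8" form of the instructions).  For example
   (illustration): 7, 255 and 0x1200 are accepted, while 0x101 and
   0x10000 are not.  */
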
15132 /* Return true if X is a valid immediate operand for an SVE logical
15133 instruction such as AND. */
15134
15135 bool
15136 aarch64_sve_bitmask_immediate_p (rtx x)
15137 {
15138 rtx elt;
15139
15140 return (const_vec_duplicate_p (x, &elt)
15141 && CONST_INT_P (elt)
15142 && aarch64_bitmask_imm (INTVAL (elt),
15143 GET_MODE_INNER (GET_MODE (x))));
15144 }
15145
15146 /* Return true if X is a valid immediate for the SVE DUP and CPY
15147 instructions. */
15148
15149 bool
15150 aarch64_sve_dup_immediate_p (rtx x)
15151 {
15152 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15153 if (!CONST_INT_P (x))
15154 return false;
15155
15156 HOST_WIDE_INT val = INTVAL (x);
15157 if (val & 0xff)
15158 return IN_RANGE (val, -0x80, 0x7f);
15159 return IN_RANGE (val, -0x8000, 0x7f00);
15160 }
15161
15162 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15163 SIGNED_P says whether the operand is signed rather than unsigned. */
15164
15165 bool
15166 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15167 {
15168 rtx elt;
15169
15170 return (const_vec_duplicate_p (x, &elt)
15171 && CONST_INT_P (elt)
15172 && (signed_p
15173 ? IN_RANGE (INTVAL (elt), -16, 15)
15174 : IN_RANGE (INTVAL (elt), 0, 127)));
15175 }
15176
15177 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15178 instruction. Negate X first if NEGATE_P is true. */
15179
15180 bool
15181 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15182 {
15183 rtx elt;
15184 REAL_VALUE_TYPE r;
15185
15186 if (!const_vec_duplicate_p (x, &elt)
15187 || GET_CODE (elt) != CONST_DOUBLE)
15188 return false;
15189
15190 r = *CONST_DOUBLE_REAL_VALUE (elt);
15191
15192 if (negate_p)
15193 r = real_value_negate (&r);
15194
15195 if (real_equal (&r, &dconst1))
15196 return true;
15197 if (real_equal (&r, &dconsthalf))
15198 return true;
15199 return false;
15200 }
15201
15202 /* Return true if X is a valid immediate operand for an SVE FMUL
15203 instruction. */
15204
15205 bool
15206 aarch64_sve_float_mul_immediate_p (rtx x)
15207 {
15208 rtx elt;
15209
15210 return (const_vec_duplicate_p (x, &elt)
15211 && GET_CODE (elt) == CONST_DOUBLE
15212 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15213 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15214 }
15215
15216 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15217 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15218 is nonnull, use it to describe valid immediates. */
15219 static bool
15220 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15221 simd_immediate_info *info,
15222 enum simd_immediate_check which,
15223 simd_immediate_info::insn_type insn)
15224 {
15225 /* Try a 4-byte immediate with LSL. */
15226 for (unsigned int shift = 0; shift < 32; shift += 8)
15227 if ((val32 & (0xff << shift)) == val32)
15228 {
15229 if (info)
15230 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15231 simd_immediate_info::LSL, shift);
15232 return true;
15233 }
15234
15235 /* Try a 2-byte immediate with LSL. */
15236 unsigned int imm16 = val32 & 0xffff;
15237 if (imm16 == (val32 >> 16))
15238 for (unsigned int shift = 0; shift < 16; shift += 8)
15239 if ((imm16 & (0xff << shift)) == imm16)
15240 {
15241 if (info)
15242 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15243 simd_immediate_info::LSL, shift);
15244 return true;
15245 }
15246
15247 /* Try a 4-byte immediate with MSL, except for cases that MVN
15248 can handle. */
15249 if (which == AARCH64_CHECK_MOV)
15250 for (unsigned int shift = 8; shift < 24; shift += 8)
15251 {
15252 unsigned int low = (1 << shift) - 1;
15253 if (((val32 & (0xff << shift)) | low) == val32)
15254 {
15255 if (info)
15256 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15257 simd_immediate_info::MSL, shift);
15258 return true;
15259 }
15260 }
15261
15262 return false;
15263 }
15264
15265 /* Return true if replicating VAL64 is a valid immediate for the
15266 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15267 use it to describe valid immediates. */
15268 static bool
15269 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15270 simd_immediate_info *info,
15271 enum simd_immediate_check which)
15272 {
15273 unsigned int val32 = val64 & 0xffffffff;
15274 unsigned int val16 = val64 & 0xffff;
15275 unsigned int val8 = val64 & 0xff;
15276
15277 if (val32 == (val64 >> 32))
15278 {
15279 if ((which & AARCH64_CHECK_ORR) != 0
15280 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15281 simd_immediate_info::MOV))
15282 return true;
15283
15284 if ((which & AARCH64_CHECK_BIC) != 0
15285 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15286 simd_immediate_info::MVN))
15287 return true;
15288
15289 /* Try using a replicated byte. */
15290 if (which == AARCH64_CHECK_MOV
15291 && val16 == (val32 >> 16)
15292 && val8 == (val16 >> 8))
15293 {
15294 if (info)
15295 *info = simd_immediate_info (QImode, val8);
15296 return true;
15297 }
15298 }
15299
15300 /* Try using a bit-to-bytemask. */
15301 if (which == AARCH64_CHECK_MOV)
15302 {
15303 unsigned int i;
15304 for (i = 0; i < 64; i += 8)
15305 {
15306 unsigned char byte = (val64 >> i) & 0xff;
15307 if (byte != 0 && byte != 0xff)
15308 break;
15309 }
15310 if (i == 64)
15311 {
15312 if (info)
15313 *info = simd_immediate_info (DImode, val64);
15314 return true;
15315 }
15316 }
15317 return false;
15318 }
15319
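/* Two examples (illustrative): a repeating value of 0x0000004300000043
   is matched as an SImode MOV of 0x43 with no shift, while
   0xffffffff00000000 is caught by the bit-to-bytemask test above (every
   byte is either 0x00 or 0xff) and handled as a 64-bit immediate.  */
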
15320 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15321 instruction. If INFO is nonnull, use it to describe valid immediates. */
15322
15323 static bool
15324 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15325 simd_immediate_info *info)
15326 {
15327 scalar_int_mode mode = DImode;
15328 unsigned int val32 = val64 & 0xffffffff;
15329 if (val32 == (val64 >> 32))
15330 {
15331 mode = SImode;
15332 unsigned int val16 = val32 & 0xffff;
15333 if (val16 == (val32 >> 16))
15334 {
15335 mode = HImode;
15336 unsigned int val8 = val16 & 0xff;
15337 if (val8 == (val16 >> 8))
15338 mode = QImode;
15339 }
15340 }
15341 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15342 if (IN_RANGE (val, -0x80, 0x7f))
15343 {
15344 /* DUP with no shift. */
15345 if (info)
15346 *info = simd_immediate_info (mode, val);
15347 return true;
15348 }
15349 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15350 {
15351 /* DUP with LSL #8. */
15352 if (info)
15353 *info = simd_immediate_info (mode, val);
15354 return true;
15355 }
15356 if (aarch64_bitmask_imm (val64, mode))
15357 {
15358 /* DUPM. */
15359 if (info)
15360 *info = simd_immediate_info (mode, val);
15361 return true;
15362 }
15363 return false;
15364 }
15365
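/* Examples (illustrative): replicating the byte 0x01 gives
   val64 == 0x0101010101010101, matched as a QImode DUP of 1; replicating
   the 16-bit value 0x1200 is matched as a DUP with LSL #8; replicating
   the 16-bit value 0x00ff fails both DUP checks but is a valid bitmask
   immediate, so it is matched as a DUPM.  */
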
15366 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15367 it to describe valid immediates. */
15368
15369 static bool
15370 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15371 {
15372 if (x == CONST0_RTX (GET_MODE (x)))
15373 {
15374 if (info)
15375 *info = simd_immediate_info (DImode, 0);
15376 return true;
15377 }
15378
15379 /* Analyze the value as a VNx16BImode. This should be relatively
15380 efficient, since rtx_vector_builder has enough built-in capacity
15381 to store all VLA predicate constants without needing the heap. */
15382 rtx_vector_builder builder;
15383 if (!aarch64_get_sve_pred_bits (builder, x))
15384 return false;
15385
15386 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15387 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15388 {
15389 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15390 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15391 if (pattern != AARCH64_NUM_SVPATTERNS)
15392 {
15393 if (info)
15394 {
15395 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15396 *info = simd_immediate_info (int_mode, pattern);
15397 }
15398 return true;
15399 }
15400 }
15401 return false;
15402 }
15403
15404 /* Return true if OP is a valid SIMD immediate for the operation
15405 described by WHICH. If INFO is nonnull, use it to describe valid
15406 immediates. */
15407 bool
15408 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15409 enum simd_immediate_check which)
15410 {
15411 machine_mode mode = GET_MODE (op);
15412 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15413 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15414 return false;
15415
15416 if (vec_flags & VEC_SVE_PRED)
15417 return aarch64_sve_pred_valid_immediate (op, info);
15418
15419 scalar_mode elt_mode = GET_MODE_INNER (mode);
15420 rtx base, step;
15421 unsigned int n_elts;
15422 if (GET_CODE (op) == CONST_VECTOR
15423 && CONST_VECTOR_DUPLICATE_P (op))
15424 n_elts = CONST_VECTOR_NPATTERNS (op);
15425 else if ((vec_flags & VEC_SVE_DATA)
15426 && const_vec_series_p (op, &base, &step))
15427 {
15428 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15429 if (!aarch64_sve_index_immediate_p (base)
15430 || !aarch64_sve_index_immediate_p (step))
15431 return false;
15432
15433 if (info)
15434 *info = simd_immediate_info (elt_mode, base, step);
15435 return true;
15436 }
15437 else if (GET_CODE (op) == CONST_VECTOR
15438 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15439 /* N_ELTS set above. */;
15440 else
15441 return false;
15442
15443 scalar_float_mode elt_float_mode;
15444 if (n_elts == 1
15445 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15446 {
15447 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15448 if (aarch64_float_const_zero_rtx_p (elt)
15449 || aarch64_float_const_representable_p (elt))
15450 {
15451 if (info)
15452 *info = simd_immediate_info (elt_float_mode, elt);
15453 return true;
15454 }
15455 }
15456
15457 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15458 if (elt_size > 8)
15459 return false;
15460
15461 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15462
15463 /* Expand the vector constant out into a byte vector, with the least
15464 significant byte of the register first. */
15465 auto_vec<unsigned char, 16> bytes;
15466 bytes.reserve (n_elts * elt_size);
15467 for (unsigned int i = 0; i < n_elts; i++)
15468 {
15469 /* The vector is provided in gcc endian-neutral fashion.
15470 For aarch64_be Advanced SIMD, it must be laid out in the vector
15471 register in reverse order. */
15472 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15473 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15474
15475 if (elt_mode != elt_int_mode)
15476 elt = gen_lowpart (elt_int_mode, elt);
15477
15478 if (!CONST_INT_P (elt))
15479 return false;
15480
15481 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15482 for (unsigned int byte = 0; byte < elt_size; byte++)
15483 {
15484 bytes.quick_push (elt_val & 0xff);
15485 elt_val >>= BITS_PER_UNIT;
15486 }
15487 }
15488
15489 /* The immediate must repeat every eight bytes. */
15490 unsigned int nbytes = bytes.length ();
15491 for (unsigned i = 8; i < nbytes; ++i)
15492 if (bytes[i] != bytes[i - 8])
15493 return false;
15494
15495 /* Get the repeating 8-byte value as an integer. No endian correction
15496 is needed here because bytes is already in lsb-first order. */
15497 unsigned HOST_WIDE_INT val64 = 0;
15498 for (unsigned int i = 0; i < 8; i++)
15499 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15500 << (i * BITS_PER_UNIT));
15501
15502 if (vec_flags & VEC_SVE_DATA)
15503 return aarch64_sve_valid_immediate (val64, info);
15504 else
15505 return aarch64_advsimd_valid_immediate (val64, info, which);
15506 }
15507
15508 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15509 has a step in the range of INDEX. Return the index expression if so,
15510 otherwise return null. */
15511 rtx
15512 aarch64_check_zero_based_sve_index_immediate (rtx x)
15513 {
15514 rtx base, step;
15515 if (const_vec_series_p (x, &base, &step)
15516 && base == const0_rtx
15517 && aarch64_sve_index_immediate_p (step))
15518 return step;
15519 return NULL_RTX;
15520 }
15521
15522 /* Check whether immediate shift constants are within range. */
15523 bool
15524 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15525 {
15526 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15527 if (left)
15528 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15529 else
15530 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15531 }
15532
15533 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15534 operation of width WIDTH at bit position POS. */
15535
15536 rtx
15537 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15538 {
15539 gcc_assert (CONST_INT_P (width));
15540 gcc_assert (CONST_INT_P (pos));
15541
15542 unsigned HOST_WIDE_INT mask
15543 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15544 return GEN_INT (mask << UINTVAL (pos));
15545 }
15546
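/* For example (illustration): WIDTH == 8 and POS == 16 give the mask
   ((1 << 8) - 1) << 16 == 0x00ff0000.  */

/* Return true if X is a valid constant or symbolic operand for a move
   into a register of mode MODE.  */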
15547 bool
15548 aarch64_mov_operand_p (rtx x, machine_mode mode)
15549 {
15550 if (GET_CODE (x) == HIGH
15551 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15552 return true;
15553
15554 if (CONST_INT_P (x))
15555 return true;
15556
15557 if (VECTOR_MODE_P (GET_MODE (x)))
15558 {
15559 /* Require predicate constants to be VNx16BI before RA, so that we
15560 force everything to have a canonical form. */
15561 if (!lra_in_progress
15562 && !reload_completed
15563 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15564 && GET_MODE (x) != VNx16BImode)
15565 return false;
15566
15567 return aarch64_simd_valid_immediate (x, NULL);
15568 }
15569
15570 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15571 return true;
15572
15573 if (aarch64_sve_cnt_immediate_p (x))
15574 return true;
15575
15576 return aarch64_classify_symbolic_expression (x)
15577 == SYMBOL_TINY_ABSOLUTE;
15578 }
15579
15580 /* Return a const_int vector of VAL. */
15581 rtx
15582 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15583 {
15584 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15585 return gen_const_vec_duplicate (mode, c);
15586 }
15587
15588 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15589
15590 bool
15591 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15592 {
15593 machine_mode vmode;
15594
15595 vmode = aarch64_simd_container_mode (mode, 64);
15596 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15597 return aarch64_simd_valid_immediate (op_v, NULL);
15598 }
15599
15600 /* Construct and return a PARALLEL RTX vector with elements numbering the
15601 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15602 the vector - from the perspective of the architecture. This does not
15603 line up with GCC's perspective on lane numbers, so we end up with
15604 different masks depending on our target endian-ness. The diagram
15605 below may help. We must draw the distinction when building masks
15606 which select one half of the vector. An instruction selecting
15607 architectural low-lanes for a big-endian target, must be described using
15608 a mask selecting GCC high-lanes.
15609
15610 Big-Endian Little-Endian
15611
15612 GCC 0 1 2 3 3 2 1 0
15613 | x | x | x | x | | x | x | x | x |
15614 Architecture 3 2 1 0 3 2 1 0
15615
15616 Low Mask: { 2, 3 } { 0, 1 }
15617 High Mask: { 0, 1 } { 2, 3 }
15618
15619 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15620
15621 rtx
15622 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15623 {
15624 rtvec v = rtvec_alloc (nunits / 2);
15625 int high_base = nunits / 2;
15626 int low_base = 0;
15627 int base;
15628 rtx t1;
15629 int i;
15630
15631 if (BYTES_BIG_ENDIAN)
15632 base = high ? low_base : high_base;
15633 else
15634 base = high ? high_base : low_base;
15635
15636 for (i = 0; i < nunits / 2; i++)
15637 RTVEC_ELT (v, i) = GEN_INT (base + i);
15638
15639 t1 = gen_rtx_PARALLEL (mode, v);
15640 return t1;
15641 }
15642
15643 /* Check OP for validity as a PARALLEL RTX vector with elements
15644 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15645 from the perspective of the architecture. See the diagram above
15646 aarch64_simd_vect_par_cnst_half for more details. */
15647
15648 bool
15649 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15650 bool high)
15651 {
15652 int nelts;
15653 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15654 return false;
15655
15656 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15657 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15658 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15659 int i = 0;
15660
15661 if (count_op != count_ideal)
15662 return false;
15663
15664 for (i = 0; i < count_ideal; i++)
15665 {
15666 rtx elt_op = XVECEXP (op, 0, i);
15667 rtx elt_ideal = XVECEXP (ideal, 0, i);
15668
15669 if (!CONST_INT_P (elt_op)
15670 || INTVAL (elt_ideal) != INTVAL (elt_op))
15671 return false;
15672 }
15673 return true;
15674 }
15675
15676 /* Return a PARALLEL containing NELTS elements, with element I equal
15677 to BASE + I * STEP. */
15678
15679 rtx
15680 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15681 {
15682 rtvec vec = rtvec_alloc (nelts);
15683 for (unsigned int i = 0; i < nelts; ++i)
15684 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15685 return gen_rtx_PARALLEL (VOIDmode, vec);
15686 }
15687
15688 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15689 series with step STEP. */
15690
15691 bool
15692 aarch64_stepped_int_parallel_p (rtx op, int step)
15693 {
15694 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15695 return false;
15696
15697 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15698 for (int i = 1; i < XVECLEN (op, 0); ++i)
15699 if (!CONST_INT_P (XVECEXP (op, 0, i))
15700 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15701 return false;
15702
15703 return true;
15704 }
15705
15706 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15707 HIGH (exclusive). */
15708 void
15709 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15710 const_tree exp)
15711 {
15712 HOST_WIDE_INT lane;
15713 gcc_assert (CONST_INT_P (operand));
15714 lane = INTVAL (operand);
15715
15716 if (lane < low || lane >= high)
15717 {
15718 if (exp)
15719 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15720 else
15721 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15722 }
15723 }
15724
15725 /* Perform endian correction on lane number N, which indexes a vector
15726 of mode MODE, and return the result as an SImode rtx. */
15727
15728 rtx
15729 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15730 {
15731 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15732 }
15733
15734 /* Return TRUE if OP is a MEM operand with a valid vector addressing mode (plain register or post-increment). */
15735
15736 bool
15737 aarch64_simd_mem_operand_p (rtx op)
15738 {
15739 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15740 || REG_P (XEXP (op, 0)));
15741 }
15742
15743 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15744
15745 bool
15746 aarch64_sve_ld1r_operand_p (rtx op)
15747 {
15748 struct aarch64_address_info addr;
15749 scalar_mode mode;
15750
15751 return (MEM_P (op)
15752 && is_a <scalar_mode> (GET_MODE (op), &mode)
15753 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15754 && addr.type == ADDRESS_REG_IMM
15755 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15756 }
15757
15758 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15759 bool
15760 aarch64_sve_ld1rq_operand_p (rtx op)
15761 {
15762 struct aarch64_address_info addr;
15763 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15764 if (!MEM_P (op)
15765 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15766 return false;
15767
15768 if (addr.type == ADDRESS_REG_IMM)
15769 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15770
15771 if (addr.type == ADDRESS_REG_REG)
15772 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15773
15774 return false;
15775 }
15776
15777 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15778 The conditions for STR are the same. */
15779 bool
15780 aarch64_sve_ldr_operand_p (rtx op)
15781 {
15782 struct aarch64_address_info addr;
15783
15784 return (MEM_P (op)
15785 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15786 false, ADDR_QUERY_ANY)
15787 && addr.type == ADDRESS_REG_IMM);
15788 }
15789
15790 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15791 We need to be able to access the individual pieces, so the range
15792 is different from LD[234] and ST[234]. */
15793 bool
15794 aarch64_sve_struct_memory_operand_p (rtx op)
15795 {
15796 if (!MEM_P (op))
15797 return false;
15798
15799 machine_mode mode = GET_MODE (op);
15800 struct aarch64_address_info addr;
15801 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15802 ADDR_QUERY_ANY)
15803 || addr.type != ADDRESS_REG_IMM)
15804 return false;
15805
15806 poly_int64 first = addr.const_offset;
15807 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15808 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15809 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15810 }
15811
15812 /* Emit a register copy from operand to operand, taking care not to
15813 early-clobber source registers in the process.
15814
15815 COUNT is the number of components into which the copy needs to be
15816 decomposed. */
15817 void
15818 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15819 unsigned int count)
15820 {
15821 unsigned int i;
15822 int rdest = REGNO (operands[0]);
15823 int rsrc = REGNO (operands[1]);
15824
15825 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15826 || rdest < rsrc)
15827 for (i = 0; i < count; i++)
15828 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15829 gen_rtx_REG (mode, rsrc + i));
15830 else
15831 for (i = 0; i < count; i++)
15832 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15833 gen_rtx_REG (mode, rsrc + count - i - 1));
15834 }
15835
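/* For example (illustrative): copying the two-vector value in {v1, v2}
   to {v2, v3} overlaps and has rdest > rsrc, so the loop above copies
   backwards (v3 <- v2, then v2 <- v1) and the source v2 is read before
   it is overwritten.  */
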
15836 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15837 one of the VSTRUCT modes: OI, CI, or XI. */
15838 int
15839 aarch64_simd_attr_length_rglist (machine_mode mode)
15840 {
15841 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15842 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15843 }
15844
15845 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15846 alignment of a vector to 128 bits. SVE predicates have an alignment of
15847 16 bits. */
15848 static HOST_WIDE_INT
15849 aarch64_simd_vector_alignment (const_tree type)
15850 {
15851 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15852 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15853 be set for non-predicate vectors of booleans. Modes are the most
15854 direct way we have of identifying real SVE predicate types. */
15855 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15856 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15857 }
15858
15859 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15860 static poly_uint64
15861 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15862 {
15863 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15864 {
15865 /* If the length of the vector is fixed, try to align to that length,
15866 otherwise don't try to align at all. */
15867 HOST_WIDE_INT result;
15868 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15869 result = TYPE_ALIGN (TREE_TYPE (type));
15870 return result;
15871 }
15872 return TYPE_ALIGN (type);
15873 }
15874
15875 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15876 static bool
15877 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15878 {
15879 if (is_packed)
15880 return false;
15881
15882 /* For fixed-length vectors, check that the vectorizer will aim for
15883 full-vector alignment. This isn't true for generic GCC vectors
15884 that are wider than the ABI maximum of 128 bits. */
15885 poly_uint64 preferred_alignment =
15886 aarch64_vectorize_preferred_vector_alignment (type);
15887 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15888 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15889 preferred_alignment))
15890 return false;
15891
15892 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15893 return true;
15894 }
15895
15896 /* Return true if the vector misalignment factor is supported by the
15897 target. */
15898 static bool
15899 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15900 const_tree type, int misalignment,
15901 bool is_packed)
15902 {
15903 if (TARGET_SIMD && STRICT_ALIGNMENT)
15904 {
15905 /* Return false if the movmisalign pattern is not supported for this mode. */
15906 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15907 return false;
15908
15909 /* Misalignment factor is unknown at compile time. */
15910 if (misalignment == -1)
15911 return false;
15912 }
15913 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15914 is_packed);
15915 }
15916
15917 /* If VALS is a vector constant that can be loaded into a register
15918 using DUP, generate instructions to do so and return an RTX to
15919 assign to the register. Otherwise return NULL_RTX. */
15920 static rtx
15921 aarch64_simd_dup_constant (rtx vals)
15922 {
15923 machine_mode mode = GET_MODE (vals);
15924 machine_mode inner_mode = GET_MODE_INNER (mode);
15925 rtx x;
15926
15927 if (!const_vec_duplicate_p (vals, &x))
15928 return NULL_RTX;
15929
15930 /* We can load this constant by using DUP and a constant in a
15931 single ARM register. This will be cheaper than a vector
15932 load. */
15933 x = copy_to_mode_reg (inner_mode, x);
15934 return gen_vec_duplicate (mode, x);
15935 }
15936
15937
15938 /* Generate code to load VALS, which is a PARALLEL containing only
15939 constants (for vec_init) or CONST_VECTOR, efficiently into a
15940 register. Returns an RTX to copy into the register, or NULL_RTX
15941 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15942 static rtx
15943 aarch64_simd_make_constant (rtx vals)
15944 {
15945 machine_mode mode = GET_MODE (vals);
15946 rtx const_dup;
15947 rtx const_vec = NULL_RTX;
15948 int n_const = 0;
15949 int i;
15950
15951 if (GET_CODE (vals) == CONST_VECTOR)
15952 const_vec = vals;
15953 else if (GET_CODE (vals) == PARALLEL)
15954 {
15955 /* A CONST_VECTOR must contain only CONST_INTs and
15956 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15957 Only store valid constants in a CONST_VECTOR. */
15958 int n_elts = XVECLEN (vals, 0);
15959 for (i = 0; i < n_elts; ++i)
15960 {
15961 rtx x = XVECEXP (vals, 0, i);
15962 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15963 n_const++;
15964 }
15965 if (n_const == n_elts)
15966 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15967 }
15968 else
15969 gcc_unreachable ();
15970
15971 if (const_vec != NULL_RTX
15972 && aarch64_simd_valid_immediate (const_vec, NULL))
15973 /* Load using MOVI/MVNI. */
15974 return const_vec;
15975 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15976 /* Loaded using DUP. */
15977 return const_dup;
15978 else if (const_vec != NULL_RTX)
15979 /* Load from constant pool. We cannot take advantage of single-cycle
15980 LD1 because we need a PC-relative addressing mode. */
15981 return const_vec;
15982 else
15983 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15984 We cannot construct an initializer. */
15985 return NULL_RTX;
15986 }
15987
15988 /* Expand a vector initialisation sequence, such that TARGET is
15989 initialised to contain VALS. */
15990
15991 void
15992 aarch64_expand_vector_init (rtx target, rtx vals)
15993 {
15994 machine_mode mode = GET_MODE (target);
15995 scalar_mode inner_mode = GET_MODE_INNER (mode);
15996 /* The number of vector elements. */
15997 int n_elts = XVECLEN (vals, 0);
15998 /* The number of vector elements which are not constant. */
15999 int n_var = 0;
16000 rtx any_const = NULL_RTX;
16001 /* The first element of vals. */
16002 rtx v0 = XVECEXP (vals, 0, 0);
16003 bool all_same = true;
16004
16005 /* This is a special vec_init<M><N> where N is not an element mode but a
16006 vector mode with half the elements of M. We expect to find two entries
16007      of mode N in VALS and we must put their concatenation into TARGET.  */
16008 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16009 {
16010 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16011 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16012 rtx lo = XVECEXP (vals, 0, 0);
16013 rtx hi = XVECEXP (vals, 0, 1);
16014 machine_mode narrow_mode = GET_MODE (lo);
16015 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16016 gcc_assert (narrow_mode == GET_MODE (hi));
16017
16018 /* When we want to concatenate a half-width vector with zeroes we can
16019 use the aarch64_combinez[_be] patterns. Just make sure that the
16020 zeroes are in the right half. */
16021 if (BYTES_BIG_ENDIAN
16022 && aarch64_simd_imm_zero (lo, narrow_mode)
16023 && general_operand (hi, narrow_mode))
16024 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16025 else if (!BYTES_BIG_ENDIAN
16026 && aarch64_simd_imm_zero (hi, narrow_mode)
16027 && general_operand (lo, narrow_mode))
16028 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16029 else
16030 {
16031 /* Else create the two half-width registers and combine them. */
16032 if (!REG_P (lo))
16033 lo = force_reg (GET_MODE (lo), lo);
16034 if (!REG_P (hi))
16035 hi = force_reg (GET_MODE (hi), hi);
16036
16037 if (BYTES_BIG_ENDIAN)
16038 std::swap (lo, hi);
16039 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16040 }
16041 return;
16042 }
16043
16044 /* Count the number of variable elements to initialise. */
16045 for (int i = 0; i < n_elts; ++i)
16046 {
16047 rtx x = XVECEXP (vals, 0, i);
16048 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16049 ++n_var;
16050 else
16051 any_const = x;
16052
16053 all_same &= rtx_equal_p (x, v0);
16054 }
16055
16056 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16057 how best to handle this. */
16058 if (n_var == 0)
16059 {
16060 rtx constant = aarch64_simd_make_constant (vals);
16061 if (constant != NULL_RTX)
16062 {
16063 emit_move_insn (target, constant);
16064 return;
16065 }
16066 }
16067
16068 /* Splat a single non-constant element if we can. */
16069 if (all_same)
16070 {
16071 rtx x = copy_to_mode_reg (inner_mode, v0);
16072 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16073 return;
16074 }
16075
16076 enum insn_code icode = optab_handler (vec_set_optab, mode);
16077 gcc_assert (icode != CODE_FOR_nothing);
16078
16079 /* If there are only variable elements, try to optimize
16080 the insertion using dup for the most common element
16081 followed by insertions. */
16082
16083 /* The algorithm will fill matches[*][0] with the earliest matching element,
16084 and matches[X][1] with the count of duplicate elements (if X is the
16085 earliest element which has duplicates). */
16086
16087 if (n_var == n_elts && n_elts <= 16)
16088 {
16089 int matches[16][2] = {0};
16090 for (int i = 0; i < n_elts; i++)
16091 {
16092 for (int j = 0; j <= i; j++)
16093 {
16094 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16095 {
16096 matches[i][0] = j;
16097 matches[j][1]++;
16098 break;
16099 }
16100 }
16101 }
16102 int maxelement = 0;
16103 int maxv = 0;
16104 for (int i = 0; i < n_elts; i++)
16105 if (matches[i][1] > maxv)
16106 {
16107 maxelement = i;
16108 maxv = matches[i][1];
16109 }
16110
16111 /* Create a duplicate of the most common element, unless all elements
16112 are equally useless to us, in which case just immediately set the
16113 vector register using the first element. */
16114
16115 if (maxv == 1)
16116 {
16117 /* For vectors of two 64-bit elements, we can do even better. */
16118 if (n_elts == 2
16119 && (inner_mode == E_DImode
16120 || inner_mode == E_DFmode))
16121
16122 {
16123 rtx x0 = XVECEXP (vals, 0, 0);
16124 rtx x1 = XVECEXP (vals, 0, 1);
16125 /* Combine can pick up this case, but handling it directly
16126 here leaves clearer RTL.
16127
16128 This is load_pair_lanes<mode>, and also gives us a clean-up
16129 for store_pair_lanes<mode>. */
16130 if (memory_operand (x0, inner_mode)
16131 && memory_operand (x1, inner_mode)
16132 && !STRICT_ALIGNMENT
16133 && rtx_equal_p (XEXP (x1, 0),
16134 plus_constant (Pmode,
16135 XEXP (x0, 0),
16136 GET_MODE_SIZE (inner_mode))))
16137 {
16138 rtx t;
16139 if (inner_mode == DFmode)
16140 t = gen_load_pair_lanesdf (target, x0, x1);
16141 else
16142 t = gen_load_pair_lanesdi (target, x0, x1);
16143 emit_insn (t);
16144 return;
16145 }
16146 }
16147 /* The subreg-move sequence below will move into lane zero of the
16148 vector register. For big-endian we want that position to hold
16149 the last element of VALS. */
16150 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16151 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16152 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16153 }
16154 else
16155 {
16156 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16157 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16158 }
16159
16160 /* Insert the rest. */
16161 for (int i = 0; i < n_elts; i++)
16162 {
16163 rtx x = XVECEXP (vals, 0, i);
16164 if (matches[i][0] == maxelement)
16165 continue;
16166 x = copy_to_mode_reg (inner_mode, x);
16167 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16168 }
16169 return;
16170 }
16171
16172 /* Initialise a vector which is part-variable. We want to first try
16173 to build those lanes which are constant in the most efficient way we
16174 can. */
16175 if (n_var != n_elts)
16176 {
16177 rtx copy = copy_rtx (vals);
16178
16179 /* Load constant part of vector. We really don't care what goes into the
16180 parts we will overwrite, but we're more likely to be able to load the
16181 constant efficiently if it has fewer, larger, repeating parts
16182 (see aarch64_simd_valid_immediate). */
16183 for (int i = 0; i < n_elts; i++)
16184 {
16185 rtx x = XVECEXP (vals, 0, i);
16186 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16187 continue;
16188 rtx subst = any_const;
16189 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16190 {
16191 /* Look in the copied vector, as more elements are const. */
16192 rtx test = XVECEXP (copy, 0, i ^ bit);
16193 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16194 {
16195 subst = test;
16196 break;
16197 }
16198 }
16199 XVECEXP (copy, 0, i) = subst;
16200 }
16201 aarch64_expand_vector_init (target, copy);
16202 }
16203
16204 /* Insert the variable lanes directly. */
16205 for (int i = 0; i < n_elts; i++)
16206 {
16207 rtx x = XVECEXP (vals, 0, i);
16208 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16209 continue;
16210 x = copy_to_mode_reg (inner_mode, x);
16211 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16212 }
16213 }
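/* As a rough illustration of the strategy above (register numbers are purely
   for exposition), initialising a V4SI vector { x, y, x, x } from general
   registers is expanded along the lines of:

	dup	v0.4s, w0		// splat the most common element, x
	ins	v0.s[1], w1		// then insert the odd one out, y

   while a partly constant vector such as { x, 1, 2, 3 } first loads the
   constant vector { 2, 1, 2, 3 } (lane 0 is padded with a neighbouring
   constant so that the constant load stays cheap) and then overwrites
   lane 0 with x using INS.  */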
16214
16215 /* Emit RTL corresponding to:
16216 insr TARGET, ELEM. */
16217
16218 static void
16219 emit_insr (rtx target, rtx elem)
16220 {
16221 machine_mode mode = GET_MODE (target);
16222 scalar_mode elem_mode = GET_MODE_INNER (mode);
16223 elem = force_reg (elem_mode, elem);
16224
16225 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16226 gcc_assert (icode != CODE_FOR_nothing);
16227 emit_insn (GEN_FCN (icode) (target, target, elem));
16228 }
16229
16230 /* Subroutine of aarch64_sve_expand_vector_init for handling
16231 trailing constants.
16232 This function works as follows:
16233 (a) Create a new vector consisting of trailing constants.
16234 (b) Initialize TARGET with the constant vector using emit_move_insn.
16235 (c) Insert remaining elements in TARGET using insr.
16236    NELTS is the total number of elements in the original vector, while
16237    NELTS_REQD is the number of elements that are actually
16238 significant.
16239
16240    ??? The heuristic used is to do the above only if the number of constants
16241    is at least half the total number of elements.  May need fine tuning.  */
16242
16243 static bool
16244 aarch64_sve_expand_vector_init_handle_trailing_constants
16245 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16246 {
16247 machine_mode mode = GET_MODE (target);
16248 scalar_mode elem_mode = GET_MODE_INNER (mode);
16249 int n_trailing_constants = 0;
16250
16251 for (int i = nelts_reqd - 1;
16252 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16253 i--)
16254 n_trailing_constants++;
16255
16256 if (n_trailing_constants >= nelts_reqd / 2)
16257 {
16258 rtx_vector_builder v (mode, 1, nelts);
16259 for (int i = 0; i < nelts; i++)
16260 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16261 rtx const_vec = v.build ();
16262 emit_move_insn (target, const_vec);
16263
16264 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16265 emit_insr (target, builder.elt (i));
16266
16267 return true;
16268 }
16269
16270 return false;
16271 }
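/* Worked example of the heuristic above (purely illustrative): for
   BUILDER = { a, b, 1, 2, 3, 4, 5, 6 } with NELTS == NELTS_REQD == 8,
   six of the eight significant elements are trailing constants, so we:

     (1) move a constant vector whose first six lanes are
	 { 1, 2, 3, 4, 5, 6 } into TARGET (the remaining lanes are
	 don't-care), then
     (2) emit "insr TARGET, b" followed by "insr TARGET, a",

   leaving the significant lanes of TARGET equal to
   { a, b, 1, 2, 3, 4, 5, 6 }.  */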
16272
16273 /* Subroutine of aarch64_sve_expand_vector_init.
16274 Works as follows:
16275 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16276 (b) Skip trailing elements from BUILDER, which are the same as
16277 element NELTS_REQD - 1.
16278 (c) Insert earlier elements in reverse order in TARGET using insr. */
16279
16280 static void
16281 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16282 const rtx_vector_builder &builder,
16283 int nelts_reqd)
16284 {
16285 machine_mode mode = GET_MODE (target);
16286 scalar_mode elem_mode = GET_MODE_INNER (mode);
16287
16288 struct expand_operand ops[2];
16289 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16290 gcc_assert (icode != CODE_FOR_nothing);
16291
16292 create_output_operand (&ops[0], target, mode);
16293 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16294 expand_insn (icode, 2, ops);
16295
16296 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16297 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16298 emit_insr (target, builder.elt (i));
16299 }
16300
16301 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16302    when all trailing elements of BUILDER are the same.
16303 This works as follows:
16304 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16305 (b) Insert remaining elements in TARGET using insr.
16306
16307    ??? The heuristic used is to do the above if the number of identical
16308    trailing elements is at least 3/4 of the total number of elements,
16309    loosely based on the heuristic in mostly_zeros_p.  May need fine-tuning.  */
16310
16311 static bool
16312 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16313 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16314 {
16315 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16316 if (ndups >= (3 * nelts_reqd) / 4)
16317 {
16318 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16319 nelts_reqd - ndups + 1);
16320 return true;
16321 }
16322
16323 return false;
16324 }
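/* Worked example (purely illustrative): for
   BUILDER = { a, b, c, c, c, c, c, c } with NELTS_REQD == 8, the six
   trailing copies of c meet the 3/4 threshold, so we broadcast c with DUP
   and then emit "insr TARGET, b" followed by "insr TARGET, a", giving
   { a, b, c, c, c, c, c, c } in the significant lanes.  */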
16325
16326 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16327 of elements in BUILDER.
16328
16329 The function tries to initialize TARGET from BUILDER if it fits one
16330 of the special cases outlined below.
16331
16332 Failing that, the function divides BUILDER into two sub-vectors:
16333 v_even = even elements of BUILDER;
16334 v_odd = odd elements of BUILDER;
16335
16336 and recursively calls itself with v_even and v_odd.
16337
16338 if (recursive call succeeded for v_even or v_odd)
16339 TARGET = zip (v_even, v_odd)
16340
16341 The function returns true if it managed to build TARGET from BUILDER
16342 with one of the special cases, false otherwise.
16343
16344 Example: {a, 1, b, 2, c, 3, d, 4}
16345
16346 The vector gets divided into:
16347 v_even = {a, b, c, d}
16348 v_odd = {1, 2, 3, 4}
16349
16350    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16351    initializes tmp2 from the constant vector v_odd using emit_move_insn.
16352
16353    aarch64_sve_expand_vector_init(v_even) fails since all 4 elements
16354    of v_even are variable, so we construct tmp1 from v_even using insr:
16355 tmp1 = dup(d)
16356 insr tmp1, c
16357 insr tmp1, b
16358 insr tmp1, a
16359
16360 And finally:
16361 TARGET = zip (tmp1, tmp2)
16362 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16363
16364 static bool
16365 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16366 int nelts, int nelts_reqd)
16367 {
16368 machine_mode mode = GET_MODE (target);
16369
16370 /* Case 1: Vector contains trailing constants. */
16371
16372 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16373 (target, builder, nelts, nelts_reqd))
16374 return true;
16375
16376 /* Case 2: Vector contains leading constants. */
16377
16378 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16379 for (int i = 0; i < nelts_reqd; i++)
16380 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16381 rev_builder.finalize ();
16382
16383 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16384 (target, rev_builder, nelts, nelts_reqd))
16385 {
16386 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16387 return true;
16388 }
16389
16390 /* Case 3: Vector contains trailing same element. */
16391
16392 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16393 (target, builder, nelts_reqd))
16394 return true;
16395
16396 /* Case 4: Vector contains leading same element. */
16397
16398   if (nelts_reqd == nelts
16399       && aarch64_sve_expand_vector_init_handle_trailing_same_elem
16399 	 (target, rev_builder, nelts_reqd))
16400 {
16401 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16402 return true;
16403 }
16404
16405   /* Avoid recursing below 4 elements.
16406 ??? The threshold 4 may need fine-tuning. */
16407
16408 if (nelts_reqd <= 4)
16409 return false;
16410
16411 rtx_vector_builder v_even (mode, 1, nelts);
16412 rtx_vector_builder v_odd (mode, 1, nelts);
16413
16414 for (int i = 0; i < nelts * 2; i += 2)
16415 {
16416 v_even.quick_push (builder.elt (i));
16417 v_odd.quick_push (builder.elt (i + 1));
16418 }
16419
16420 v_even.finalize ();
16421 v_odd.finalize ();
16422
16423 rtx tmp1 = gen_reg_rtx (mode);
16424 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16425 nelts, nelts_reqd / 2);
16426
16427 rtx tmp2 = gen_reg_rtx (mode);
16428 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16429 nelts, nelts_reqd / 2);
16430
16431 if (!did_even_p && !did_odd_p)
16432 return false;
16433
16434   /* Use INSR to initialize whichever of v_even and v_odd did not match
16435      one of the special cases above, then zip the two halves together.  */
16436
16437 if (!did_even_p)
16438 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16439
16440 if (!did_odd_p)
16441 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16442
16443 rtvec v = gen_rtvec (2, tmp1, tmp2);
16444 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16445 return true;
16446 }
16447
16448 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16449
16450 void
16451 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16452 {
16453 machine_mode mode = GET_MODE (target);
16454 int nelts = XVECLEN (vals, 0);
16455
16456 rtx_vector_builder v (mode, 1, nelts);
16457 for (int i = 0; i < nelts; i++)
16458 v.quick_push (XVECEXP (vals, 0, i));
16459 v.finalize ();
16460
16461   /* If neither sub-vector of v could be initialized specially,
16462      then use INSR to insert all elements from v into TARGET.
16463      ??? This might not be optimal for vectors with large
16464      initializers, such as 16 elements or more.
16465      For nelts < 4, it probably isn't useful to handle specially.  */
16466
16467 if (nelts < 4
16468 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16469 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16470 }
16471
16472 static unsigned HOST_WIDE_INT
16473 aarch64_shift_truncation_mask (machine_mode mode)
16474 {
16475 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16476 return 0;
16477 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16478 }
16479
16480 /* Select a format to encode pointers in exception handling data. */
16481 int
16482 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16483 {
16484 int type;
16485 switch (aarch64_cmodel)
16486 {
16487 case AARCH64_CMODEL_TINY:
16488 case AARCH64_CMODEL_TINY_PIC:
16489 case AARCH64_CMODEL_SMALL:
16490 case AARCH64_CMODEL_SMALL_PIC:
16491 case AARCH64_CMODEL_SMALL_SPIC:
16492 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16493 for everything. */
16494 type = DW_EH_PE_sdata4;
16495 break;
16496 default:
16497 /* No assumptions here. 8-byte relocs required. */
16498 type = DW_EH_PE_sdata8;
16499 break;
16500 }
16501 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16502 }
16503
16504 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16505
16506 static void
16507 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16508 {
16509 if (aarch64_simd_decl_p (decl))
16510 {
16511 fprintf (stream, "\t.variant_pcs\t");
16512 assemble_name (stream, name);
16513 fprintf (stream, "\n");
16514 }
16515 }
16516
16517 /* The last .arch and .tune assembly strings that we printed. */
16518 static std::string aarch64_last_printed_arch_string;
16519 static std::string aarch64_last_printed_tune_string;
16520
16521 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16522 by the function fndecl. */
16523
16524 void
16525 aarch64_declare_function_name (FILE *stream, const char* name,
16526 tree fndecl)
16527 {
16528 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16529
16530 struct cl_target_option *targ_options;
16531 if (target_parts)
16532 targ_options = TREE_TARGET_OPTION (target_parts);
16533 else
16534 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16535 gcc_assert (targ_options);
16536
16537 const struct processor *this_arch
16538 = aarch64_get_arch (targ_options->x_explicit_arch);
16539
16540 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16541 std::string extension
16542 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16543 this_arch->flags);
16544 /* Only update the assembler .arch string if it is distinct from the last
16545 such string we printed. */
16546 std::string to_print = this_arch->name + extension;
16547 if (to_print != aarch64_last_printed_arch_string)
16548 {
16549 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16550 aarch64_last_printed_arch_string = to_print;
16551 }
16552
16553   /* Print the cpu name we're tuning for in the comments; it might be
16554      useful to readers of the generated asm.  Do it only when it changes
16555      from function to function and verbose assembly is requested.  */
16556 const struct processor *this_tune
16557 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16558
16559 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16560 {
16561 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16562 this_tune->name);
16563 aarch64_last_printed_tune_string = this_tune->name;
16564 }
16565
16566 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16567
16568 /* Don't forget the type directive for ELF. */
16569 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16570 ASM_OUTPUT_LABEL (stream, name);
16571 }
16572
16573 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16574
16575 void
16576 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16577 {
16578 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16579 const char *value = IDENTIFIER_POINTER (target);
16580 aarch64_asm_output_variant_pcs (stream, decl, name);
16581 ASM_OUTPUT_DEF (stream, name, value);
16582 }
16583
16584 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16585 function symbol references. */
16586
16587 void
16588 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16589 {
16590 default_elf_asm_output_external (stream, decl, name);
16591 aarch64_asm_output_variant_pcs (stream, decl, name);
16592 }
16593
16594 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16595 Used to output the .cfi_b_key_frame directive when signing the current
16596 function with the B key. */
16597
16598 void
16599 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16600 {
16601 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16602 && aarch64_ra_sign_key == AARCH64_KEY_B)
16603 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16604 }
16605
16606 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16607
16608 static void
16609 aarch64_start_file (void)
16610 {
16611 struct cl_target_option *default_options
16612 = TREE_TARGET_OPTION (target_option_default_node);
16613
16614 const struct processor *default_arch
16615 = aarch64_get_arch (default_options->x_explicit_arch);
16616 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16617 std::string extension
16618 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16619 default_arch->flags);
16620
16621 aarch64_last_printed_arch_string = default_arch->name + extension;
16622 aarch64_last_printed_tune_string = "";
16623 asm_fprintf (asm_out_file, "\t.arch %s\n",
16624 aarch64_last_printed_arch_string.c_str ());
16625
16626 default_file_start ();
16627 }
16628
16629 /* Emit load exclusive. */
16630
16631 static void
16632 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16633 rtx mem, rtx model_rtx)
16634 {
16635 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16636 }
16637
16638 /* Emit store exclusive. */
16639
16640 static void
16641 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16642 rtx rval, rtx mem, rtx model_rtx)
16643 {
16644 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16645 }
16646
16647 /* Emit INSN as a jump and mark it as unlikely to be taken.  */
16648
16649 static void
16650 aarch64_emit_unlikely_jump (rtx insn)
16651 {
16652 rtx_insn *jump = emit_jump_insn (insn);
16653 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16654 }
16655
16656 /* Expand a compare and swap pattern. */
16657
16658 void
16659 aarch64_expand_compare_and_swap (rtx operands[])
16660 {
16661 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16662 machine_mode mode, r_mode;
16663
16664 bval = operands[0];
16665 rval = operands[1];
16666 mem = operands[2];
16667 oldval = operands[3];
16668 newval = operands[4];
16669 is_weak = operands[5];
16670 mod_s = operands[6];
16671 mod_f = operands[7];
16672 mode = GET_MODE (mem);
16673
16674 /* Normally the succ memory model must be stronger than fail, but in the
16675 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16676 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16677 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16678 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16679 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16680
16681 r_mode = mode;
16682 if (mode == QImode || mode == HImode)
16683 {
16684 r_mode = SImode;
16685 rval = gen_reg_rtx (r_mode);
16686 }
16687
16688 if (TARGET_LSE)
16689 {
16690 /* The CAS insn requires oldval and rval overlap, but we need to
16691 have a copy of oldval saved across the operation to tell if
16692 the operation is successful. */
16693 if (reg_overlap_mentioned_p (rval, oldval))
16694 rval = copy_to_mode_reg (r_mode, oldval);
16695 else
16696 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16697
16698 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16699 newval, mod_s));
16700 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16701 }
16702 else
16703 {
16704 /* The oldval predicate varies by mode. Test it and force to reg. */
16705 insn_code code = code_for_aarch64_compare_and_swap (mode);
16706 if (!insn_data[code].operand[2].predicate (oldval, mode))
16707 oldval = force_reg (mode, oldval);
16708
16709 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16710 is_weak, mod_s, mod_f));
16711 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16712 }
16713
16714 if (r_mode != mode)
16715 rval = gen_lowpart (mode, rval);
16716 emit_move_insn (operands[1], rval);
16717
16718 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16719 emit_insn (gen_rtx_SET (bval, x));
16720 }
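/* The ACQUIRE/RELEASE promotion above matters for user code such as the
   following C11 fragment (shown only as an illustration of where that
   combination of memory models can come from):

     #include <stdatomic.h>

     _Bool
     try_update (atomic_int *p, int *expected, int desired)
     {
       return atomic_compare_exchange_strong_explicit
	 (p, expected, desired, memory_order_release, memory_order_acquire);
     }

   Here the success order is RELEASE and the failure order is ACQUIRE, so the
   success order used for the generated sequence is promoted to ACQ_REL to
   preserve the acquire semantics of the failure path.  */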
16721
16722 /* Emit a barrier appropriate for memory model MODEL at the end of a
16723    sequence implementing an atomic operation.  */
16724
16725 static void
16726 aarch64_emit_post_barrier (enum memmodel model)
16727 {
16728 const enum memmodel base_model = memmodel_base (model);
16729
16730 if (is_mm_sync (model)
16731 && (base_model == MEMMODEL_ACQUIRE
16732 || base_model == MEMMODEL_ACQ_REL
16733 || base_model == MEMMODEL_SEQ_CST))
16734 {
16735 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16736 }
16737 }
16738
16739 /* Split a compare and swap pattern. */
16740
16741 void
16742 aarch64_split_compare_and_swap (rtx operands[])
16743 {
16744 rtx rval, mem, oldval, newval, scratch;
16745 machine_mode mode;
16746 bool is_weak;
16747 rtx_code_label *label1, *label2;
16748 rtx x, cond;
16749 enum memmodel model;
16750 rtx model_rtx;
16751
16752 rval = operands[0];
16753 mem = operands[1];
16754 oldval = operands[2];
16755 newval = operands[3];
16756 is_weak = (operands[4] != const0_rtx);
16757 model_rtx = operands[5];
16758 scratch = operands[7];
16759 mode = GET_MODE (mem);
16760 model = memmodel_from_int (INTVAL (model_rtx));
16761
16762 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16763 loop:
16764 .label1:
16765 LD[A]XR rval, [mem]
16766 CBNZ rval, .label2
16767 ST[L]XR scratch, newval, [mem]
16768 CBNZ scratch, .label1
16769 .label2:
16770 CMP rval, 0. */
16771 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16772
16773 label1 = NULL;
16774 if (!is_weak)
16775 {
16776 label1 = gen_label_rtx ();
16777 emit_label (label1);
16778 }
16779 label2 = gen_label_rtx ();
16780
16781 /* The initial load can be relaxed for a __sync operation since a final
16782 barrier will be emitted to stop code hoisting. */
16783 if (is_mm_sync (model))
16784 aarch64_emit_load_exclusive (mode, rval, mem,
16785 GEN_INT (MEMMODEL_RELAXED));
16786 else
16787 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16788
16789 if (strong_zero_p)
16790 {
16791 if (aarch64_track_speculation)
16792 {
16793 /* Emit an explicit compare instruction, so that we can correctly
16794 track the condition codes. */
16795 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16796 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16797 }
16798 else
16799 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16800
16801 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16802 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16803 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16804 }
16805 else
16806 {
16807 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16808 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16809 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16810 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16811 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16812 }
16813
16814 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16815
16816 if (!is_weak)
16817 {
16818 if (aarch64_track_speculation)
16819 {
16820 /* Emit an explicit compare instruction, so that we can correctly
16821 track the condition codes. */
16822 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16823 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16824 }
16825 else
16826 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16827
16828 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16829 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16830 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16831 }
16832 else
16833 {
16834 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16835 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16836 emit_insn (gen_rtx_SET (cond, x));
16837 }
16838
16839 emit_label (label2);
16840   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16841      to set the condition flags.  If this is not used it will be removed by
16842      later passes.  */
16843 if (strong_zero_p)
16844 {
16845 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16846 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16847 emit_insn (gen_rtx_SET (cond, x));
16848 }
16849 /* Emit any final barrier needed for a __sync operation. */
16850 if (is_mm_sync (model))
16851 aarch64_emit_post_barrier (model);
16852 }
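/* For reference, when OLDVAL is not known to be zero the strong form of the
   loop built above looks roughly like (illustrative only):
	.label1:
	LD[A]XR	rval, [mem]
	CMP	rval, oldval
	B.NE	.label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
	.label2:
   with the result of the comparison left in the condition flags on exit.  */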
16853
16854 /* Split an atomic operation. */
16855
16856 void
16857 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16858 rtx value, rtx model_rtx, rtx cond)
16859 {
16860 machine_mode mode = GET_MODE (mem);
16861 machine_mode wmode = (mode == DImode ? DImode : SImode);
16862 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16863 const bool is_sync = is_mm_sync (model);
16864 rtx_code_label *label;
16865 rtx x;
16866
16867 /* Split the atomic operation into a sequence. */
16868 label = gen_label_rtx ();
16869 emit_label (label);
16870
16871 if (new_out)
16872 new_out = gen_lowpart (wmode, new_out);
16873 if (old_out)
16874 old_out = gen_lowpart (wmode, old_out);
16875 else
16876 old_out = new_out;
16877 value = simplify_gen_subreg (wmode, value, mode, 0);
16878
16879 /* The initial load can be relaxed for a __sync operation since a final
16880 barrier will be emitted to stop code hoisting. */
16881 if (is_sync)
16882 aarch64_emit_load_exclusive (mode, old_out, mem,
16883 GEN_INT (MEMMODEL_RELAXED));
16884 else
16885 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16886
16887 switch (code)
16888 {
16889 case SET:
16890 new_out = value;
16891 break;
16892
16893 case NOT:
16894 x = gen_rtx_AND (wmode, old_out, value);
16895 emit_insn (gen_rtx_SET (new_out, x));
16896 x = gen_rtx_NOT (wmode, new_out);
16897 emit_insn (gen_rtx_SET (new_out, x));
16898 break;
16899
16900 case MINUS:
16901 if (CONST_INT_P (value))
16902 {
16903 value = GEN_INT (-INTVAL (value));
16904 code = PLUS;
16905 }
16906 /* Fall through. */
16907
16908 default:
16909 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16910 emit_insn (gen_rtx_SET (new_out, x));
16911 break;
16912 }
16913
16914 aarch64_emit_store_exclusive (mode, cond, mem,
16915 gen_lowpart (mode, new_out), model_rtx);
16916
16917 if (aarch64_track_speculation)
16918 {
16919 /* Emit an explicit compare instruction, so that we can correctly
16920 track the condition codes. */
16921 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16922 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16923 }
16924 else
16925 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16926
16927 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16928 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16929 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16930
16931 /* Emit any final barrier needed for a __sync operation. */
16932 if (is_sync)
16933 aarch64_emit_post_barrier (model);
16934 }
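/* For example, splitting an atomic fetch-and-add on an SImode location
   yields a loop of roughly this shape (register choice is illustrative):
	.loop:
	LD[A]XR	w0, [x1]
	ADD	w2, w0, w3
	ST[L]XR	w4, w2, [x1]
	CBNZ	w4, .loop
   with acquire/release variants of the exclusive accesses selected according
   to the memory model, and a trailing full barrier added only for
   __sync-style operations.  */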
16935
16936 static void
16937 aarch64_init_libfuncs (void)
16938 {
16939 /* Half-precision float operations. The compiler handles all operations
16940 with NULL libfuncs by converting to SFmode. */
16941
16942 /* Conversions. */
16943 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16944 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16945
16946 /* Arithmetic. */
16947 set_optab_libfunc (add_optab, HFmode, NULL);
16948 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16949 set_optab_libfunc (smul_optab, HFmode, NULL);
16950 set_optab_libfunc (neg_optab, HFmode, NULL);
16951 set_optab_libfunc (sub_optab, HFmode, NULL);
16952
16953 /* Comparisons. */
16954 set_optab_libfunc (eq_optab, HFmode, NULL);
16955 set_optab_libfunc (ne_optab, HFmode, NULL);
16956 set_optab_libfunc (lt_optab, HFmode, NULL);
16957 set_optab_libfunc (le_optab, HFmode, NULL);
16958 set_optab_libfunc (ge_optab, HFmode, NULL);
16959 set_optab_libfunc (gt_optab, HFmode, NULL);
16960 set_optab_libfunc (unord_optab, HFmode, NULL);
16961 }
16962
16963 /* Target hook for c_mode_for_suffix. */
16964 static machine_mode
16965 aarch64_c_mode_for_suffix (char suffix)
16966 {
16967 if (suffix == 'q')
16968 return TFmode;
16969
16970 return VOIDmode;
16971 }
16972
16973 /* We can only represent floating point constants which will fit in
16974 "quarter-precision" values. These values are characterised by
16975    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16976    by:
16977
16978 (-1)^s * (n/16) * 2^r
16979
16980 Where:
16981 's' is the sign bit.
16982 'n' is an integer in the range 16 <= n <= 31.
16983 'r' is an integer in the range -3 <= r <= 4. */
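/* For example, 2.5 is representable as 20/16 * 2^1 (s = 0, n = 20, r = 1)
   and -0.40625 as -(26/16) * 2^-2, whereas 0.0625 would need r = -4 and
   33.0 would need either n = 33 or r = 5, so neither can be encoded in
   this form.  */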
16984
16985 /* Return true iff X can be represented as a quarter-precision
16986    floating point immediate operand.  Note, we cannot represent 0.0.  */
16987 bool
16988 aarch64_float_const_representable_p (rtx x)
16989 {
16990 /* This represents our current view of how many bits
16991 make up the mantissa. */
16992 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16993 int exponent;
16994 unsigned HOST_WIDE_INT mantissa, mask;
16995 REAL_VALUE_TYPE r, m;
16996 bool fail;
16997
16998 x = unwrap_const_vec_duplicate (x);
16999 if (!CONST_DOUBLE_P (x))
17000 return false;
17001
17002 if (GET_MODE (x) == VOIDmode
17003 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17004 return false;
17005
17006 r = *CONST_DOUBLE_REAL_VALUE (x);
17007
17008 /* We cannot represent infinities, NaNs or +/-zero. We won't
17009 know if we have +zero until we analyse the mantissa, but we
17010 can reject the other invalid values. */
17011 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17012 || REAL_VALUE_MINUS_ZERO (r))
17013 return false;
17014
17015 /* Extract exponent. */
17016 r = real_value_abs (&r);
17017 exponent = REAL_EXP (&r);
17018
17019 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17020 highest (sign) bit, with a fixed binary point at bit point_pos.
17021      The low half of W holds the low part of the mantissa, the high half the high part.
17022 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17023 bits for the mantissa, this can fail (low bits will be lost). */
17024 real_ldexp (&m, &r, point_pos - exponent);
17025 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17026
17027 /* If the low part of the mantissa has bits set we cannot represent
17028 the value. */
17029 if (w.ulow () != 0)
17030 return false;
17031 /* We have rejected the lower HOST_WIDE_INT, so update our
17032 understanding of how many bits lie in the mantissa and
17033 look only at the high HOST_WIDE_INT. */
17034 mantissa = w.elt (1);
17035 point_pos -= HOST_BITS_PER_WIDE_INT;
17036
17037 /* We can only represent values with a mantissa of the form 1.xxxx. */
17038 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17039 if ((mantissa & mask) != 0)
17040 return false;
17041
17042 /* Having filtered unrepresentable values, we may now remove all
17043 but the highest 5 bits. */
17044 mantissa >>= point_pos - 5;
17045
17046 /* We cannot represent the value 0.0, so reject it. This is handled
17047 elsewhere. */
17048 if (mantissa == 0)
17049 return false;
17050
17051 /* Then, as bit 4 is always set, we can mask it off, leaving
17052 the mantissa in the range [0, 15]. */
17053 mantissa &= ~(1 << 4);
17054 gcc_assert (mantissa <= 15);
17055
17056 /* GCC internally does not use IEEE754-like encoding (where normalized
17057 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17058 Our mantissa values are shifted 4 places to the left relative to
17059 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17060 by 5 places to correct for GCC's representation. */
17061 exponent = 5 - exponent;
17062
17063 return (exponent >= 0 && exponent <= 7);
17064 }
17065
17066 /* Return the assembly string for moving CONST_VECTOR, an AdvSIMD vector
17067    constant of WIDTH bits, using a MOVI, MVNI, ORR or BIC immediate.
17068    WHICH selects whether to output the MOVI/MVNI, ORR or BIC form.  */
17069 char*
17070 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17071 enum simd_immediate_check which)
17072 {
17073 bool is_valid;
17074 static char templ[40];
17075 const char *mnemonic;
17076 const char *shift_op;
17077 unsigned int lane_count = 0;
17078 char element_char;
17079
17080 struct simd_immediate_info info;
17081
17082 /* This will return true to show const_vector is legal for use as either
17083      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17084 It will also update INFO to show how the immediate should be generated.
17085 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17086 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17087 gcc_assert (is_valid);
17088
17089 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17090 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17091
17092 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17093 {
17094 gcc_assert (info.insn == simd_immediate_info::MOV
17095 && info.u.mov.shift == 0);
17096 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17097 move immediate path. */
17098 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17099 info.u.mov.value = GEN_INT (0);
17100 else
17101 {
17102 const unsigned int buf_size = 20;
17103 char float_buf[buf_size] = {'\0'};
17104 real_to_decimal_for_mode (float_buf,
17105 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17106 buf_size, buf_size, 1, info.elt_mode);
17107
17108 if (lane_count == 1)
17109 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17110 else
17111 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17112 lane_count, element_char, float_buf);
17113 return templ;
17114 }
17115 }
17116
17117 gcc_assert (CONST_INT_P (info.u.mov.value));
17118
17119 if (which == AARCH64_CHECK_MOV)
17120 {
17121 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17122 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17123 ? "msl" : "lsl");
17124 if (lane_count == 1)
17125 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17126 mnemonic, UINTVAL (info.u.mov.value));
17127 else if (info.u.mov.shift)
17128 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17129 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17130 element_char, UINTVAL (info.u.mov.value), shift_op,
17131 info.u.mov.shift);
17132 else
17133 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17134 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17135 element_char, UINTVAL (info.u.mov.value));
17136 }
17137 else
17138 {
17139 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17140 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17141 if (info.u.mov.shift)
17142 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17143 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17144 element_char, UINTVAL (info.u.mov.value), "lsl",
17145 info.u.mov.shift);
17146 else
17147 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17148 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17149 element_char, UINTVAL (info.u.mov.value));
17150 }
17151 return templ;
17152 }
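/* As a concrete illustration, the templates returned above look like
   (immediate values chosen arbitrarily):
     "movi\t%d0, 0xff"			single 64-bit lane, no shift
     "movi\t%0.4s, 0xab, lsl 8"		shifted MOVI
     "mvni\t%0.8h, 0x2, msl 8"		inverted move using MSL
     "orr\t%0.4s, #16, lsl #8"		AARCH64_CHECK_ORR form
     "fmov\t%0.4s, 1.0e+0"		floating-point immediate
   with the operand modifiers expanded later by output_asm_insn.  */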
17153
17154 char*
17155 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17156 {
17157
17158 /* If a floating point number was passed and we desire to use it in an
17159      integer mode, do the conversion to integer.  */
17160 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17161 {
17162 unsigned HOST_WIDE_INT ival;
17163 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17164 gcc_unreachable ();
17165 immediate = gen_int_mode (ival, mode);
17166 }
17167
17168 machine_mode vmode;
17169   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
17170      a 128-bit vector mode.  */
17171 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17172
17173 vmode = aarch64_simd_container_mode (mode, width);
17174 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17175 return aarch64_output_simd_mov_immediate (v_op, width);
17176 }
17177
17178 /* Return the output string to use for moving immediate CONST_VECTOR
17179 into an SVE register. */
17180
17181 char *
17182 aarch64_output_sve_mov_immediate (rtx const_vector)
17183 {
17184 static char templ[40];
17185 struct simd_immediate_info info;
17186 char element_char;
17187
17188 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17189 gcc_assert (is_valid);
17190
17191 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17192
17193 machine_mode vec_mode = GET_MODE (const_vector);
17194 if (aarch64_sve_pred_mode_p (vec_mode))
17195 {
17196 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17197 if (info.insn == simd_immediate_info::MOV)
17198 {
17199 gcc_assert (info.u.mov.value == const0_rtx);
17200 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17201 }
17202 else
17203 {
17204 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17205 unsigned int total_bytes;
17206 if (info.u.pattern == AARCH64_SV_ALL
17207 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17208 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17209 total_bytes / GET_MODE_SIZE (info.elt_mode));
17210 else
17211 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17212 svpattern_token (info.u.pattern));
17213 }
17214 return buf;
17215 }
17216
17217 if (info.insn == simd_immediate_info::INDEX)
17218 {
17219 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17220 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17221 element_char, INTVAL (info.u.index.base),
17222 INTVAL (info.u.index.step));
17223 return templ;
17224 }
17225
17226 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17227 {
17228 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17229 info.u.mov.value = GEN_INT (0);
17230 else
17231 {
17232 const int buf_size = 20;
17233 char float_buf[buf_size] = {};
17234 real_to_decimal_for_mode (float_buf,
17235 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17236 buf_size, buf_size, 1, info.elt_mode);
17237
17238 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17239 element_char, float_buf);
17240 return templ;
17241 }
17242 }
17243
17244 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17245 element_char, INTVAL (info.u.mov.value));
17246 return templ;
17247 }
17248
17249 /* Split operands into moves from op[1] + op[2] into op[0]. */
17250
17251 void
17252 aarch64_split_combinev16qi (rtx operands[3])
17253 {
17254 unsigned int dest = REGNO (operands[0]);
17255 unsigned int src1 = REGNO (operands[1]);
17256 unsigned int src2 = REGNO (operands[2]);
17257 machine_mode halfmode = GET_MODE (operands[1]);
17258 unsigned int halfregs = REG_NREGS (operands[1]);
17259 rtx destlo, desthi;
17260
17261 gcc_assert (halfmode == V16QImode);
17262
17263 if (src1 == dest && src2 == dest + halfregs)
17264 {
17265 /* No-op move. Can't split to nothing; emit something. */
17266 emit_note (NOTE_INSN_DELETED);
17267 return;
17268 }
17269
17270 /* Preserve register attributes for variable tracking. */
17271 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17272 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17273 GET_MODE_SIZE (halfmode));
17274
17275 /* Special case of reversed high/low parts. */
17276 if (reg_overlap_mentioned_p (operands[2], destlo)
17277 && reg_overlap_mentioned_p (operands[1], desthi))
17278 {
17279 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17280 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17281 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17282 }
17283 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17284 {
17285 /* Try to avoid unnecessary moves if part of the result
17286 is in the right place already. */
17287 if (src1 != dest)
17288 emit_move_insn (destlo, operands[1]);
17289 if (src2 != dest + halfregs)
17290 emit_move_insn (desthi, operands[2]);
17291 }
17292 else
17293 {
17294 if (src2 != dest + halfregs)
17295 emit_move_insn (desthi, operands[2]);
17296 if (src1 != dest)
17297 emit_move_insn (destlo, operands[1]);
17298 }
17299 }
17300
17301 /* vec_perm support. */
17302
17303 struct expand_vec_perm_d
17304 {
17305 rtx target, op0, op1;
17306 vec_perm_indices perm;
17307 machine_mode vmode;
17308 unsigned int vec_flags;
17309 bool one_vector_p;
17310 bool testing_p;
17311 };
17312
17313 /* Generate a variable permutation. */
17314
17315 static void
17316 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17317 {
17318 machine_mode vmode = GET_MODE (target);
17319 bool one_vector_p = rtx_equal_p (op0, op1);
17320
17321 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17322 gcc_checking_assert (GET_MODE (op0) == vmode);
17323 gcc_checking_assert (GET_MODE (op1) == vmode);
17324 gcc_checking_assert (GET_MODE (sel) == vmode);
17325 gcc_checking_assert (TARGET_SIMD);
17326
17327 if (one_vector_p)
17328 {
17329 if (vmode == V8QImode)
17330 {
17331 /* Expand the argument to a V16QI mode by duplicating it. */
17332 rtx pair = gen_reg_rtx (V16QImode);
17333 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17334 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17335 }
17336 else
17337 {
17338 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17339 }
17340 }
17341 else
17342 {
17343 rtx pair;
17344
17345 if (vmode == V8QImode)
17346 {
17347 pair = gen_reg_rtx (V16QImode);
17348 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17349 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17350 }
17351 else
17352 {
17353 pair = gen_reg_rtx (OImode);
17354 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17355 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17356 }
17357 }
17358 }
17359
17360 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17361 NELT is the number of elements in the vector. */
17362
17363 void
17364 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17365 unsigned int nelt)
17366 {
17367 machine_mode vmode = GET_MODE (target);
17368 bool one_vector_p = rtx_equal_p (op0, op1);
17369 rtx mask;
17370
17371 /* The TBL instruction does not use a modulo index, so we must take care
17372 of that ourselves. */
17373 mask = aarch64_simd_gen_const_vector_dup (vmode,
17374 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17375 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17376
17377 /* For big-endian, we also need to reverse the index within the vector
17378 (but not which vector). */
17379 if (BYTES_BIG_ENDIAN)
17380 {
17381 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17382 if (!one_vector_p)
17383 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17384 sel = expand_simple_binop (vmode, XOR, sel, mask,
17385 NULL, 0, OPTAB_LIB_WIDEN);
17386 }
17387 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17388 }
17389
17390 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17391
17392 static void
17393 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17394 {
17395 emit_insn (gen_rtx_SET (target,
17396 gen_rtx_UNSPEC (GET_MODE (target),
17397 gen_rtvec (2, op0, op1), code)));
17398 }
17399
17400 /* Expand an SVE vec_perm with the given operands. */
17401
17402 void
17403 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17404 {
17405 machine_mode data_mode = GET_MODE (target);
17406 machine_mode sel_mode = GET_MODE (sel);
17407 /* Enforced by the pattern condition. */
17408 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17409
17410 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17411 size of the two value vectors, i.e. the upper bits of the indices
17412 are effectively ignored. SVE TBL instead produces 0 for any
17413 out-of-range indices, so we need to modulo all the vec_perm indices
17414 to ensure they are all in range. */
17415 rtx sel_reg = force_reg (sel_mode, sel);
17416
17417 /* Check if the sel only references the first values vector. */
17418 if (GET_CODE (sel) == CONST_VECTOR
17419 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17420 {
17421 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17422 return;
17423 }
17424
17425 /* Check if the two values vectors are the same. */
17426 if (rtx_equal_p (op0, op1))
17427 {
17428 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17429 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17430 NULL, 0, OPTAB_DIRECT);
17431 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17432 return;
17433 }
17434
17435   /* Run TBL on each value vector and combine the results.  */
17436
17437 rtx res0 = gen_reg_rtx (data_mode);
17438 rtx res1 = gen_reg_rtx (data_mode);
17439 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17440 if (GET_CODE (sel) != CONST_VECTOR
17441 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17442 {
17443 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17444 2 * nunits - 1);
17445 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17446 NULL, 0, OPTAB_DIRECT);
17447 }
17448 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17449 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17450 NULL, 0, OPTAB_DIRECT);
17451 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17452 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17453 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17454 else
17455 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17456 }
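/* In the general two-input case this therefore emits a sequence along the
   lines of (register names are illustrative):
	and	z4.s, z4.s, #(2 * nunits - 1)	// only if SEL may be out of range
	tbl	z2.s, { z0.s }, z4.s		// lanes selecting op1 give 0
	sub	z4.s, z4.s, #nunits
	tbl	z3.s, { z1.s }, z4.s		// lanes selecting op0 give 0
	orr	z5.d, z2.d, z3.d		// combine the two halves
   with the final bitwise OR expressed through UNSPEC_IORF for
   floating-point data modes.  */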
17457
17458 /* Recognize patterns suitable for the TRN instructions. */
17459 static bool
17460 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17461 {
17462 HOST_WIDE_INT odd;
17463 poly_uint64 nelt = d->perm.length ();
17464 rtx out, in0, in1, x;
17465 machine_mode vmode = d->vmode;
17466
17467 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17468 return false;
17469
17470 /* Note that these are little-endian tests.
17471 We correct for big-endian later. */
17472 if (!d->perm[0].is_constant (&odd)
17473 || (odd != 0 && odd != 1)
17474 || !d->perm.series_p (0, 2, odd, 2)
17475 || !d->perm.series_p (1, 2, nelt + odd, 2))
17476 return false;
17477
17478 /* Success! */
17479 if (d->testing_p)
17480 return true;
17481
17482 in0 = d->op0;
17483 in1 = d->op1;
17484 /* We don't need a big-endian lane correction for SVE; see the comment
17485 at the head of aarch64-sve.md for details. */
17486 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17487 {
17488 x = in0, in0 = in1, in1 = x;
17489 odd = !odd;
17490 }
17491 out = d->target;
17492
17493 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17494 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17495 return true;
17496 }
17497
17498 /* Recognize patterns suitable for the UZP instructions. */
17499 static bool
17500 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17501 {
17502 HOST_WIDE_INT odd;
17503 rtx out, in0, in1, x;
17504 machine_mode vmode = d->vmode;
17505
17506 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17507 return false;
17508
17509 /* Note that these are little-endian tests.
17510 We correct for big-endian later. */
17511 if (!d->perm[0].is_constant (&odd)
17512 || (odd != 0 && odd != 1)
17513 || !d->perm.series_p (0, 1, odd, 2))
17514 return false;
17515
17516 /* Success! */
17517 if (d->testing_p)
17518 return true;
17519
17520 in0 = d->op0;
17521 in1 = d->op1;
17522 /* We don't need a big-endian lane correction for SVE; see the comment
17523 at the head of aarch64-sve.md for details. */
17524 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17525 {
17526 x = in0, in0 = in1, in1 = x;
17527 odd = !odd;
17528 }
17529 out = d->target;
17530
17531 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17532 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17533 return true;
17534 }
17535
17536 /* Recognize patterns suitable for the ZIP instructions. */
17537 static bool
17538 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17539 {
17540 unsigned int high;
17541 poly_uint64 nelt = d->perm.length ();
17542 rtx out, in0, in1, x;
17543 machine_mode vmode = d->vmode;
17544
17545 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17546 return false;
17547
17548 /* Note that these are little-endian tests.
17549 We correct for big-endian later. */
17550 poly_uint64 first = d->perm[0];
17551 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17552 || !d->perm.series_p (0, 2, first, 1)
17553 || !d->perm.series_p (1, 2, first + nelt, 1))
17554 return false;
17555 high = maybe_ne (first, 0U);
17556
17557 /* Success! */
17558 if (d->testing_p)
17559 return true;
17560
17561 in0 = d->op0;
17562 in1 = d->op1;
17563 /* We don't need a big-endian lane correction for SVE; see the comment
17564 at the head of aarch64-sve.md for details. */
17565 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17566 {
17567 x = in0, in0 = in1, in1 = x;
17568 high = !high;
17569 }
17570 out = d->target;
17571
17572 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17573 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17574 return true;
17575 }
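/* To make the three recognisers above concrete, for 4-element input vectors
   { a0 a1 a2 a3 } and { b0 b1 b2 b3 } the (little-endian) index patterns
   accepted are:
     TRN1: { 0, 4, 2, 6 }   TRN2: { 1, 5, 3, 7 }
     UZP1: { 0, 2, 4, 6 }   UZP2: { 1, 3, 5, 7 }
     ZIP1: { 0, 4, 1, 5 }   ZIP2: { 2, 6, 3, 7 }
   i.e. TRN interleaves the matching even (or odd) lanes of both inputs,
   UZP concatenates the even (or odd) lanes of both inputs, and ZIP
   interleaves the low (or high) halves.  */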
17576
17577 /* Recognize patterns for the EXT insn. */
17578
17579 static bool
17580 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17581 {
17582 HOST_WIDE_INT location;
17583 rtx offset;
17584
17585 /* The first element always refers to the first vector.
17586 Check if the extracted indices are increasing by one. */
17587 if (d->vec_flags == VEC_SVE_PRED
17588 || !d->perm[0].is_constant (&location)
17589 || !d->perm.series_p (0, 1, location, 1))
17590 return false;
17591
17592 /* Success! */
17593 if (d->testing_p)
17594 return true;
17595
17596 /* The case where (location == 0) is a no-op for both big- and little-endian,
17597 and is removed by the mid-end at optimization levels -O1 and higher.
17598
17599 We don't need a big-endian lane correction for SVE; see the comment
17600 at the head of aarch64-sve.md for details. */
17601 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17602 {
17603 /* After setup, we want the high elements of the first vector (stored
17604 at the LSB end of the register), and the low elements of the second
17605 vector (stored at the MSB end of the register). So swap. */
17606 std::swap (d->op0, d->op1);
17607 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17608 to_constant () is safe since this is restricted to Advanced SIMD
17609 vectors. */
17610 location = d->perm.length ().to_constant () - location;
17611 }
17612
17613 offset = GEN_INT (location);
17614 emit_set_insn (d->target,
17615 gen_rtx_UNSPEC (d->vmode,
17616 gen_rtvec (3, d->op0, d->op1, offset),
17617 UNSPEC_EXT));
17618 return true;
17619 }
17620
17621 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17622 within each 64-bit, 32-bit or 16-bit granule. */
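/* Illustrative example (not from the original sources): a V8QI selector of
   { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the bytes within each 32-bit granule
   and is matched here as REV32.  */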
17623
17624 static bool
17625 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17626 {
17627 HOST_WIDE_INT diff;
17628 unsigned int i, size, unspec;
17629 machine_mode pred_mode;
17630
17631 if (d->vec_flags == VEC_SVE_PRED
17632 || !d->one_vector_p
17633 || !d->perm[0].is_constant (&diff))
17634 return false;
17635
17636 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17637 if (size == 8)
17638 {
17639 unspec = UNSPEC_REV64;
17640 pred_mode = VNx2BImode;
17641 }
17642 else if (size == 4)
17643 {
17644 unspec = UNSPEC_REV32;
17645 pred_mode = VNx4BImode;
17646 }
17647 else if (size == 2)
17648 {
17649 unspec = UNSPEC_REV16;
17650 pred_mode = VNx8BImode;
17651 }
17652 else
17653 return false;
17654
17655 unsigned int step = diff + 1;
17656 for (i = 0; i < step; ++i)
17657 if (!d->perm.series_p (i, step, diff - i, step))
17658 return false;
17659
17660 /* Success! */
17661 if (d->testing_p)
17662 return true;
17663
17664 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17665 if (d->vec_flags == VEC_SVE_DATA)
17666 {
17667 rtx pred = aarch64_ptrue_reg (pred_mode);
17668 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17669 UNSPEC_PRED_X);
17670 }
17671 emit_set_insn (d->target, src);
17672 return true;
17673 }
17674
17675 /* Recognize patterns for the REV insn, which reverses elements within
17676 a full vector. */
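/* Illustrative example (not from the original sources): for a vector of N
   elements, the selector { N-1, N-2, ..., 1, 0 } is matched here and is
   emitted as a single REV of the whole vector.  */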
17677
17678 static bool
17679 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17680 {
17681 poly_uint64 nelt = d->perm.length ();
17682
17683 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17684 return false;
17685
17686 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17687 return false;
17688
17689 /* Success! */
17690 if (d->testing_p)
17691 return true;
17692
17693 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17694 emit_set_insn (d->target, src);
17695 return true;
17696 }
17697
17698 static bool
17699 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17700 {
17701 rtx out = d->target;
17702 rtx in0;
17703 HOST_WIDE_INT elt;
17704 machine_mode vmode = d->vmode;
17705 rtx lane;
17706
17707 if (d->vec_flags == VEC_SVE_PRED
17708 || d->perm.encoding ().encoded_nelts () != 1
17709 || !d->perm[0].is_constant (&elt))
17710 return false;
17711
17712 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17713 return false;
17714
17715 /* Success! */
17716 if (d->testing_p)
17717 return true;
17718
17719 /* The generic preparation in aarch64_expand_vec_perm_const_1
17720 swaps the operand order and the permute indices if it finds
17721 d->perm[0] to be in the second operand. Thus, we can always
17722 use d->op0 and need not do any extra arithmetic to get the
17723 correct lane number. */
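/* Illustrative example (not from the original sources): a V4SI selector of
   { 2, 2, 2, 2 } is matched here and becomes a duplicate of lane 2 of the
   first operand, i.e. DUP Vd.4S, Vn.S[2] on little-endian.  */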
17724 in0 = d->op0;
17725 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17726
17727 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17728 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17729 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17730 return true;
17731 }
17732
17733 static bool
17734 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17735 {
17736 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17737 machine_mode vmode = d->vmode;
17738
17739 /* Make sure that the indices are constant. */
17740 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17741 for (unsigned int i = 0; i < encoded_nelts; ++i)
17742 if (!d->perm[i].is_constant ())
17743 return false;
17744
17745 if (d->testing_p)
17746 return true;
17747
17748 /* Generic code will try constant permutation twice. Once with the
17749 original mode and again with the elements lowered to QImode.
17750 So wait and don't do the selector expansion ourselves. */
17751 if (vmode != V8QImode && vmode != V16QImode)
17752 return false;
17753
17754 /* to_constant is safe since this routine is specific to Advanced SIMD
17755 vectors. */
17756 unsigned int nelt = d->perm.length ().to_constant ();
17757 for (unsigned int i = 0; i < nelt; ++i)
17758 /* If big-endian and two vectors, we end up with a weird mixed-endian
17759 mode on NEON. Reverse the index within each word but not the word
17760 itself. to_constant is safe because we checked is_constant above. */
17761 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17762 ? d->perm[i].to_constant () ^ (nelt - 1)
17763 : d->perm[i].to_constant ());
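/* For illustration (not from the original sources): with V8QImode we have
   nelt == 8, so a big-endian index of 3 becomes 3 ^ 7 == 4 here.  */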
17764
17765 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17766 sel = force_reg (vmode, sel);
17767
17768 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17769 return true;
17770 }
17771
17772 /* Try to implement D using an SVE TBL instruction. */
17773
17774 static bool
17775 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17776 {
17777 unsigned HOST_WIDE_INT nelt;
17778
17779 /* Permuting two variable-length vectors could overflow the
17780 index range. */
17781 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17782 return false;
17783
17784 if (d->testing_p)
17785 return true;
17786
17787 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17788 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17789 if (d->one_vector_p)
17790 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17791 else
17792 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17793 return true;
17794 }
17795
17796 static bool
17797 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17798 {
17799 /* The pattern matching functions above are written to look for a small
17800 number to begin the sequence (0, 1, N/2). If we begin with an index
17801 from the second operand, we can swap the operands. */
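/* For example (illustration only): a V4SI selector of { 4, 0, 5, 1 }
   becomes { 0, 4, 1, 5 } with the two operands swapped, which the ZIP1
   case in aarch64_evpc_zip then matches.  */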
17802 poly_int64 nelt = d->perm.length ();
17803 if (known_ge (d->perm[0], nelt))
17804 {
17805 d->perm.rotate_inputs (1);
17806 std::swap (d->op0, d->op1);
17807 }
17808
17809 if ((d->vec_flags == VEC_ADVSIMD
17810 || d->vec_flags == VEC_SVE_DATA
17811 || d->vec_flags == VEC_SVE_PRED)
17812 && known_gt (nelt, 1))
17813 {
17814 if (aarch64_evpc_rev_local (d))
17815 return true;
17816 else if (aarch64_evpc_rev_global (d))
17817 return true;
17818 else if (aarch64_evpc_ext (d))
17819 return true;
17820 else if (aarch64_evpc_dup (d))
17821 return true;
17822 else if (aarch64_evpc_zip (d))
17823 return true;
17824 else if (aarch64_evpc_uzp (d))
17825 return true;
17826 else if (aarch64_evpc_trn (d))
17827 return true;
17828 if (d->vec_flags == VEC_SVE_DATA)
17829 return aarch64_evpc_sve_tbl (d);
17830 else if (d->vec_flags == VEC_ADVSIMD)
17831 return aarch64_evpc_tbl (d);
17832 }
17833 return false;
17834 }
17835
17836 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17837
17838 static bool
17839 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17840 rtx op1, const vec_perm_indices &sel)
17841 {
17842 struct expand_vec_perm_d d;
17843
17844 /* Check whether the mask can be applied to a single vector. */
17845 if (sel.ninputs () == 1
17846 || (op0 && rtx_equal_p (op0, op1)))
17847 d.one_vector_p = true;
17848 else if (sel.all_from_input_p (0))
17849 {
17850 d.one_vector_p = true;
17851 op1 = op0;
17852 }
17853 else if (sel.all_from_input_p (1))
17854 {
17855 d.one_vector_p = true;
17856 op0 = op1;
17857 }
17858 else
17859 d.one_vector_p = false;
17860
17861 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17862 sel.nelts_per_input ());
17863 d.vmode = vmode;
17864 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17865 d.target = target;
17866 d.op0 = op0;
17867 d.op1 = op1;
17868 d.testing_p = !target;
17869
17870 if (!d.testing_p)
17871 return aarch64_expand_vec_perm_const_1 (&d);
17872
17873 rtx_insn *last = get_last_insn ();
17874 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17875 gcc_assert (last == get_last_insn ());
17876
17877 return ret;
17878 }
17879
17880 /* Generate a byte permute mask for a register of mode MODE,
17881 which has NUNITS units. */
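/* For example (illustration only): for V4SImode the mask built below is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. a byte
   reversal within each 32-bit element.  */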
17882
17883 rtx
17884 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17885 {
17886 /* We have to reverse each vector because we don't have
17887 a permuted load that can reverse-load according to ABI rules. */
17888 rtx mask;
17889 rtvec v = rtvec_alloc (16);
17890 unsigned int i, j;
17891 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17892
17893 gcc_assert (BYTES_BIG_ENDIAN);
17894 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17895
17896 for (i = 0; i < nunits; i++)
17897 for (j = 0; j < usize; j++)
17898 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17899 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17900 return force_reg (V16QImode, mask);
17901 }
17902
17903 /* Expand an SVE integer comparison using the SVE equivalent of:
17904
17905 (set TARGET (CODE OP0 OP1)). */
17906
17907 void
17908 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17909 {
17910 machine_mode pred_mode = GET_MODE (target);
17911 machine_mode data_mode = GET_MODE (op0);
17912 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
17913 op0, op1);
17914 if (!rtx_equal_p (target, res))
17915 emit_move_insn (target, res);
17916 }
17917
17918 /* Return the UNSPEC_COND_* code for comparison CODE. */
17919
17920 static unsigned int
17921 aarch64_unspec_cond_code (rtx_code code)
17922 {
17923 switch (code)
17924 {
17925 case NE:
17926 return UNSPEC_COND_FCMNE;
17927 case EQ:
17928 return UNSPEC_COND_FCMEQ;
17929 case LT:
17930 return UNSPEC_COND_FCMLT;
17931 case GT:
17932 return UNSPEC_COND_FCMGT;
17933 case LE:
17934 return UNSPEC_COND_FCMLE;
17935 case GE:
17936 return UNSPEC_COND_FCMGE;
17937 case UNORDERED:
17938 return UNSPEC_COND_FCMUO;
17939 default:
17940 gcc_unreachable ();
17941 }
17942 }
17943
17944 /* Emit:
17945
17946 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17947
17948 where <X> is the operation associated with comparison CODE.
17949 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17950
17951 static void
17952 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
17953 bool known_ptrue_p, rtx op0, rtx op1)
17954 {
17955 rtx flag = gen_int_mode (known_ptrue_p, SImode);
17956 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17957 gen_rtvec (4, pred, flag, op0, op1),
17958 aarch64_unspec_cond_code (code));
17959 emit_set_insn (target, unspec);
17960 }
17961
17962 /* Emit the SVE equivalent of:
17963
17964 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
17965 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
17966 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17967
17968 where <Xi> is the operation associated with comparison CODEi.
17969 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17970
17971 static void
17972 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
17973 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
17974 {
17975 machine_mode pred_mode = GET_MODE (pred);
17976 rtx tmp1 = gen_reg_rtx (pred_mode);
17977 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
17978 rtx tmp2 = gen_reg_rtx (pred_mode);
17979 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
17980 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17981 }
17982
17983 /* Emit the SVE equivalent of:
17984
17985 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17986 (set TARGET (not TMP))
17987
17988 where <X> is the operation associated with comparison CODE.
17989 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17990
17991 static void
17992 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
17993 bool known_ptrue_p, rtx op0, rtx op1)
17994 {
17995 machine_mode pred_mode = GET_MODE (pred);
17996 rtx tmp = gen_reg_rtx (pred_mode);
17997 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
17998 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17999 }
18000
18001 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18002
18003 (set TARGET (CODE OP0 OP1))
18004
18005 If CAN_INVERT_P is true, the caller can also handle inverted results;
18006 return true if the result is in fact inverted. */
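/* For example (illustration only): GE is handled directly with a single
   predicated FCMGE, while UNLT with -fno-trapping-math is handled by
   emitting FCMGE instead and reporting the result as inverted when
   CAN_INVERT_P, or by negating the predicate otherwise.  */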
18007
18008 bool
18009 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18010 rtx op0, rtx op1, bool can_invert_p)
18011 {
18012 machine_mode pred_mode = GET_MODE (target);
18013 machine_mode data_mode = GET_MODE (op0);
18014
18015 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18016 switch (code)
18017 {
18018 case UNORDERED:
18019 /* UNORDERED has no immediate form. */
18020 op1 = force_reg (data_mode, op1);
18021 /* fall through */
18022 case LT:
18023 case LE:
18024 case GT:
18025 case GE:
18026 case EQ:
18027 case NE:
18028 {
18029 /* There is native support for the comparison. */
18030 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18031 return false;
18032 }
18033
18034 case LTGT:
18035 /* This is a trapping operation (LT or GT). */
18036 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18037 return false;
18038
18039 case UNEQ:
18040 if (!flag_trapping_math)
18041 {
18042 /* This would trap for signaling NaNs. */
18043 op1 = force_reg (data_mode, op1);
18044 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18045 ptrue, true, op0, op1);
18046 return false;
18047 }
18048 /* fall through */
18049 case UNLT:
18050 case UNLE:
18051 case UNGT:
18052 case UNGE:
18053 if (flag_trapping_math)
18054 {
18055 /* Work out which elements are ordered. */
18056 rtx ordered = gen_reg_rtx (pred_mode);
18057 op1 = force_reg (data_mode, op1);
18058 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18059 ptrue, true, op0, op1);
18060
18061 /* Test the opposite condition for the ordered elements,
18062 then invert the result. */
18063 if (code == UNEQ)
18064 code = NE;
18065 else
18066 code = reverse_condition_maybe_unordered (code);
18067 if (can_invert_p)
18068 {
18069 aarch64_emit_sve_fp_cond (target, code,
18070 ordered, false, op0, op1);
18071 return true;
18072 }
18073 aarch64_emit_sve_invert_fp_cond (target, code,
18074 ordered, false, op0, op1);
18075 return false;
18076 }
18077 break;
18078
18079 case ORDERED:
18080 /* ORDERED has no immediate form. */
18081 op1 = force_reg (data_mode, op1);
18082 break;
18083
18084 default:
18085 gcc_unreachable ();
18086 }
18087
18088 /* There is native support for the inverse comparison. */
18089 code = reverse_condition_maybe_unordered (code);
18090 if (can_invert_p)
18091 {
18092 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18093 return true;
18094 }
18095 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18096 return false;
18097 }
18098
18099 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18100 of the data being selected and CMP_MODE is the mode of the values being
18101 compared. */
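/* For example (illustration only): a vcond such as (a > b ? c : d) on SVE
   integer vectors is expanded to a predicated compare followed by a select,
   roughly:

       cmpgt   p0.s, p1/z, z0.s, z1.s
       sel     z2.s, p0, z3.s, z4.s  */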
18102
18103 void
18104 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18105 rtx *ops)
18106 {
18107 machine_mode pred_mode
18108 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18109 GET_MODE_SIZE (cmp_mode)).require ();
18110 rtx pred = gen_reg_rtx (pred_mode);
18111 if (FLOAT_MODE_P (cmp_mode))
18112 {
18113 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18114 ops[4], ops[5], true))
18115 std::swap (ops[1], ops[2]);
18116 }
18117 else
18118 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18119
18120 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18121 ops[1] = force_reg (data_mode, ops[1]);
18122 /* The "false" value can only be zero if the "true" value is a constant. */
18123 if (register_operand (ops[1], data_mode)
18124 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18125 ops[2] = force_reg (data_mode, ops[2]);
18126
18127 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18128 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18129 }
18130
18131 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18132 true. However, due to issues with register allocation it is preferable
18133 to avoid tying integer scalar and FP scalar modes. Executing integer
18134 operations in general registers is better than treating them as scalar
18135 vector operations. This reduces latency and avoids redundant int<->FP
18136 moves. So tie modes if they are either the same class, or vector modes
18137 with other vector modes, vector structs or any scalar mode. */
18138
18139 static bool
18140 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18141 {
18142 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18143 return true;
18144
18145 /* We specifically want to allow elements of "structure" modes to
18146 be tieable to the structure. This more general condition allows
18147 other rarer situations too. The reason we don't extend this to
18148 predicate modes is that there are no predicate structure modes
18149 nor any specific instructions for extracting part of a predicate
18150 register. */
18151 if (aarch64_vector_data_mode_p (mode1)
18152 && aarch64_vector_data_mode_p (mode2))
18153 return true;
18154
18155 /* Also allow any scalar modes with vectors. */
18156 if (aarch64_vector_mode_supported_p (mode1)
18157 || aarch64_vector_mode_supported_p (mode2))
18158 return true;
18159
18160 return false;
18161 }
18162
18163 /* Return a new RTX holding the result of moving POINTER forward by
18164 AMOUNT bytes. */
18165
18166 static rtx
18167 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18168 {
18169 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18170
18171 return adjust_automodify_address (pointer, GET_MODE (pointer),
18172 next, amount);
18173 }
18174
18175 /* Return a new RTX holding the result of moving POINTER forward by the
18176 size of the mode it points to. */
18177
18178 static rtx
18179 aarch64_progress_pointer (rtx pointer)
18180 {
18181 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18182 }
18183
18184 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18185 MODE bytes. */
18186
18187 static void
18188 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18189 machine_mode mode)
18190 {
18191 rtx reg = gen_reg_rtx (mode);
18192
18193 /* "Cast" the pointers to the correct mode. */
18194 *src = adjust_address (*src, mode, 0);
18195 *dst = adjust_address (*dst, mode, 0);
18196 /* Emit the memcpy. */
18197 emit_move_insn (reg, *src);
18198 emit_move_insn (*dst, reg);
18199 /* Move the pointers forward. */
18200 *src = aarch64_progress_pointer (*src);
18201 *dst = aarch64_progress_pointer (*dst);
18202 }
18203
18204 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18205 we succeed, otherwise return false. */
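/* For example (illustration only): a constant 32-byte copy is expanded
   below as two TImode block copies (roughly two 16-byte loads and two
   16-byte stores), and a 15-byte copy as two overlapping 8-byte (DImode)
   block copies.  */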
18206
18207 bool
18208 aarch64_expand_cpymem (rtx *operands)
18209 {
18210 int n, mode_bits;
18211 rtx dst = operands[0];
18212 rtx src = operands[1];
18213 rtx base;
18214 machine_mode cur_mode = BLKmode, next_mode;
18215 bool speed_p = !optimize_function_for_size_p (cfun);
18216
18217 /* When optimizing for size, give a better estimate of the length of a
18218 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18219 will always require an even number of instructions to perform. And each
18220 operation requires both a load and a store, so divide the max number by 2. */
18221 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18222
18223 /* We can't do anything smart if the amount to copy is not constant. */
18224 if (!CONST_INT_P (operands[2]))
18225 return false;
18226
18227 n = INTVAL (operands[2]);
18228
18229 /* Try to keep the number of instructions low. For all cases we will do at
18230 most two moves for the residual amount, since we'll always overlap the
18231 remainder. */
18232 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18233 return false;
18234
18235 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18236 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18237
18238 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18239 src = adjust_automodify_address (src, VOIDmode, base, 0);
18240
18241 /* Convert n to bits to make the rest of the code simpler. */
18242 n = n * BITS_PER_UNIT;
18243
18244 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18245 larger than TImode, but we should not use them for loads/stores here. */
18246 const int copy_limit = GET_MODE_BITSIZE (TImode);
18247
18248 while (n > 0)
18249 {
18250 /* Find the largest mode in which to do the copy without over-reading
18251 or over-writing. */
18252 opt_scalar_int_mode mode_iter;
18253 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18254 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18255 cur_mode = mode_iter.require ();
18256
18257 gcc_assert (cur_mode != BLKmode);
18258
18259 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18260 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18261
18262 n -= mode_bits;
18263
18264 /* Do certain trailing copies as overlapping if it's going to be
18265 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy it's
18266 more efficient to do two overlapping 8-byte copies than copies of
18267 8 + 4 + 2 + 1 bytes. */
18268 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18269 {
18270 next_mode = smallest_mode_for_size (n, MODE_INT);
18271 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18272 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18273 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18274 n = n_bits;
18275 }
18276 }
18277
18278 return true;
18279 }
18280
18281 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18282 SImode stores. Handle the case when the constant has identical
18283 bottom and top halves. This is beneficial when the two stores can be
18284 merged into an STP and we avoid synthesising potentially expensive
18285 immediates twice. Return true if such a split is possible. */
18286
18287 bool
18288 aarch64_split_dimode_const_store (rtx dst, rtx src)
18289 {
18290 rtx lo = gen_lowpart (SImode, src);
18291 rtx hi = gen_highpart_mode (SImode, DImode, src);
18292
18293 bool size_p = optimize_function_for_size_p (cfun);
18294
18295 if (!rtx_equal_p (lo, hi))
18296 return false;
18297
18298 unsigned int orig_cost
18299 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18300 unsigned int lo_cost
18301 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18302
18303 /* We want to transform:
18304 MOV x1, 49370
18305 MOVK x1, 0x140, lsl 16
18306 MOVK x1, 0xc0da, lsl 32
18307 MOVK x1, 0x140, lsl 48
18308 STR x1, [x0]
18309 into:
18310 MOV w1, 49370
18311 MOVK w1, 0x140, lsl 16
18312 STP w1, w1, [x0]
18313 So we want to perform this only when we save two instructions
18314 or more. When optimizing for size, however, accept any code size
18315 savings we can. */
18316 if (size_p && orig_cost <= lo_cost)
18317 return false;
18318
18319 if (!size_p
18320 && (orig_cost <= lo_cost + 1))
18321 return false;
18322
18323 rtx mem_lo = adjust_address (dst, SImode, 0);
18324 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18325 return false;
18326
18327 rtx tmp_reg = gen_reg_rtx (SImode);
18328 aarch64_expand_mov_immediate (tmp_reg, lo);
18329 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18330 /* Don't emit an explicit store pair as this may not always be profitable.
18331 Let the sched-fusion logic decide whether to merge them. */
18332 emit_move_insn (mem_lo, tmp_reg);
18333 emit_move_insn (mem_hi, tmp_reg);
18334
18335 return true;
18336 }
18337
18338 /* Generate RTL for a conditional branch with rtx comparison CODE in
18339 mode CC_MODE. The destination of the unlikely conditional branch
18340 is LABEL_REF. */
18341
18342 void
18343 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18344 rtx label_ref)
18345 {
18346 rtx x;
18347 x = gen_rtx_fmt_ee (code, VOIDmode,
18348 gen_rtx_REG (cc_mode, CC_REGNUM),
18349 const0_rtx);
18350
18351 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18352 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18353 pc_rtx);
18354 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18355 }
18356
18357 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18358
18359 OP1 represents the TImode destination operand 1
18360 OP2 represents the TImode destination operand 2
18361 LOW_DEST represents the low half (DImode) of TImode operand 0
18362 LOW_IN1 represents the low half (DImode) of TImode operand 1
18363 LOW_IN2 represents the low half (DImode) of TImode operand 2
18364 HIGH_DEST represents the high half (DImode) of TImode operand 0
18365 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18366 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18367
18368 void
18369 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18370 rtx *low_in1, rtx *low_in2,
18371 rtx *high_dest, rtx *high_in1,
18372 rtx *high_in2)
18373 {
18374 *low_dest = gen_reg_rtx (DImode);
18375 *low_in1 = gen_lowpart (DImode, op1);
18376 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18377 subreg_lowpart_offset (DImode, TImode));
18378 *high_dest = gen_reg_rtx (DImode);
18379 *high_in1 = gen_highpart (DImode, op1);
18380 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18381 subreg_highpart_offset (DImode, TImode));
18382 }
18383
18384 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18385
18386 This function differs from 'aarch64_addti_scratch_regs' in that
18387 OP1 can be an immediate constant (zero). We must call
18388 subreg_highpart_offset with DImode and TImode arguments, otherwise
18389 VOIDmode will be used for the const_int, which generates an internal
18390 error from subreg_size_highpart_offset, which does not expect a size of zero.
18391
18392 OP1 represents the TImode destination operand 1
18393 OP2 represents the TImode destination operand 2
18394 LOW_DEST represents the low half (DImode) of TImode operand 0
18395 LOW_IN1 represents the low half (DImode) of TImode operand 1
18396 LOW_IN2 represents the low half (DImode) of TImode operand 2
18397 HIGH_DEST represents the high half (DImode) of TImode operand 0
18398 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18399 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18400
18401
18402 void
18403 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18404 rtx *low_in1, rtx *low_in2,
18405 rtx *high_dest, rtx *high_in1,
18406 rtx *high_in2)
18407 {
18408 *low_dest = gen_reg_rtx (DImode);
18409 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18410 subreg_lowpart_offset (DImode, TImode));
18411
18412 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18413 subreg_lowpart_offset (DImode, TImode));
18414 *high_dest = gen_reg_rtx (DImode);
18415
18416 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18417 subreg_highpart_offset (DImode, TImode));
18418 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18419 subreg_highpart_offset (DImode, TImode));
18420 }
18421
18422 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18423
18424 OP0 represents the TImode destination operand 0
18425 LOW_DEST represents the low half (DImode) of TImode operand 0
18426 LOW_IN1 represents the low half (DImode) of TImode operand 1
18427 LOW_IN2 represents the low half (DImode) of TImode operand 2
18428 HIGH_DEST represents the high half (DImode) of TImode operand 0
18429 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18430 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18431 UNSIGNED_P is true if the operation is being performed on unsigned
18432 values. */
18433 void
18434 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18435 rtx low_in2, rtx high_dest, rtx high_in1,
18436 rtx high_in2, bool unsigned_p)
18437 {
18438 if (low_in2 == const0_rtx)
18439 {
18440 low_dest = low_in1;
18441 high_in2 = force_reg (DImode, high_in2);
18442 if (unsigned_p)
18443 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18444 else
18445 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18446 }
18447 else
18448 {
18449 if (CONST_INT_P (low_in2))
18450 {
18451 high_in2 = force_reg (DImode, high_in2);
18452 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18453 GEN_INT (-INTVAL (low_in2))));
18454 }
18455 else
18456 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18457
18458 if (unsigned_p)
18459 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18460 else
18461 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18462 }
18463
18464 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18465 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18466
18467 }
18468
18469 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
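/* For reference: the values below are 1 << 29 == 0x20000000 under
   TARGET_ILP32 and 1 << 36 == 0x1000000000 otherwise.  */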
18470
18471 static unsigned HOST_WIDE_INT
18472 aarch64_asan_shadow_offset (void)
18473 {
18474 if (TARGET_ILP32)
18475 return (HOST_WIDE_INT_1 << 29);
18476 else
18477 return (HOST_WIDE_INT_1 << 36);
18478 }
18479
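/* Illustrative only: together with aarch64_gen_ccmp_next below, this lets a
   condition such as (a == 0 && b == 42) be expanded as a compare followed
   by a conditional compare, roughly:

       cmp    w0, #0
       ccmp   w1, #42, #<nzcv>, eq
       b.eq   <target>

   where <nzcv> encodes the flag state to use when the first comparison
   fails.  */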
18480 static rtx
18481 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18482 int code, tree treeop0, tree treeop1)
18483 {
18484 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18485 rtx op0, op1;
18486 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18487 insn_code icode;
18488 struct expand_operand ops[4];
18489
18490 start_sequence ();
18491 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18492
18493 op_mode = GET_MODE (op0);
18494 if (op_mode == VOIDmode)
18495 op_mode = GET_MODE (op1);
18496
18497 switch (op_mode)
18498 {
18499 case E_QImode:
18500 case E_HImode:
18501 case E_SImode:
18502 cmp_mode = SImode;
18503 icode = CODE_FOR_cmpsi;
18504 break;
18505
18506 case E_DImode:
18507 cmp_mode = DImode;
18508 icode = CODE_FOR_cmpdi;
18509 break;
18510
18511 case E_SFmode:
18512 cmp_mode = SFmode;
18513 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18514 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18515 break;
18516
18517 case E_DFmode:
18518 cmp_mode = DFmode;
18519 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18520 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18521 break;
18522
18523 default:
18524 end_sequence ();
18525 return NULL_RTX;
18526 }
18527
18528 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18529 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18530 if (!op0 || !op1)
18531 {
18532 end_sequence ();
18533 return NULL_RTX;
18534 }
18535 *prep_seq = get_insns ();
18536 end_sequence ();
18537
18538 create_fixed_operand (&ops[0], op0);
18539 create_fixed_operand (&ops[1], op1);
18540
18541 start_sequence ();
18542 if (!maybe_expand_insn (icode, 2, ops))
18543 {
18544 end_sequence ();
18545 return NULL_RTX;
18546 }
18547 *gen_seq = get_insns ();
18548 end_sequence ();
18549
18550 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18551 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18552 }
18553
18554 static rtx
18555 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18556 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18557 {
18558 rtx op0, op1, target;
18559 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18560 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18561 insn_code icode;
18562 struct expand_operand ops[6];
18563 int aarch64_cond;
18564
18565 push_to_sequence (*prep_seq);
18566 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18567
18568 op_mode = GET_MODE (op0);
18569 if (op_mode == VOIDmode)
18570 op_mode = GET_MODE (op1);
18571
18572 switch (op_mode)
18573 {
18574 case E_QImode:
18575 case E_HImode:
18576 case E_SImode:
18577 cmp_mode = SImode;
18578 icode = CODE_FOR_ccmpsi;
18579 break;
18580
18581 case E_DImode:
18582 cmp_mode = DImode;
18583 icode = CODE_FOR_ccmpdi;
18584 break;
18585
18586 case E_SFmode:
18587 cmp_mode = SFmode;
18588 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18589 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18590 break;
18591
18592 case E_DFmode:
18593 cmp_mode = DFmode;
18594 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18595 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18596 break;
18597
18598 default:
18599 end_sequence ();
18600 return NULL_RTX;
18601 }
18602
18603 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18604 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18605 if (!op0 || !op1)
18606 {
18607 end_sequence ();
18608 return NULL_RTX;
18609 }
18610 *prep_seq = get_insns ();
18611 end_sequence ();
18612
18613 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18614 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18615
18616 if (bit_code != AND)
18617 {
18618 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18619 GET_MODE (XEXP (prev, 0))),
18620 VOIDmode, XEXP (prev, 0), const0_rtx);
18621 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18622 }
18623
18624 create_fixed_operand (&ops[0], XEXP (prev, 0));
18625 create_fixed_operand (&ops[1], target);
18626 create_fixed_operand (&ops[2], op0);
18627 create_fixed_operand (&ops[3], op1);
18628 create_fixed_operand (&ops[4], prev);
18629 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18630
18631 push_to_sequence (*gen_seq);
18632 if (!maybe_expand_insn (icode, 6, ops))
18633 {
18634 end_sequence ();
18635 return NULL_RTX;
18636 }
18637
18638 *gen_seq = get_insns ();
18639 end_sequence ();
18640
18641 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18642 }
18643
18644 #undef TARGET_GEN_CCMP_FIRST
18645 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18646
18647 #undef TARGET_GEN_CCMP_NEXT
18648 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18649
18650 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18651 instruction fusion of some sort. */
18652
18653 static bool
18654 aarch64_macro_fusion_p (void)
18655 {
18656 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18657 }
18658
18659
18660 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18661 should be kept together during scheduling. */
18662
18663 static bool
18664 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18665 {
18666 rtx set_dest;
18667 rtx prev_set = single_set (prev);
18668 rtx curr_set = single_set (curr);
18669 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18670 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18671
18672 if (!aarch64_macro_fusion_p ())
18673 return false;
18674
18675 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18676 {
18677 /* We are trying to match:
18678 prev (mov) == (set (reg r0) (const_int imm16))
18679 curr (movk) == (set (zero_extract (reg r0)
18680 (const_int 16)
18681 (const_int 16))
18682 (const_int imm16_1)) */
18683
18684 set_dest = SET_DEST (curr_set);
18685
18686 if (GET_CODE (set_dest) == ZERO_EXTRACT
18687 && CONST_INT_P (SET_SRC (curr_set))
18688 && CONST_INT_P (SET_SRC (prev_set))
18689 && CONST_INT_P (XEXP (set_dest, 2))
18690 && INTVAL (XEXP (set_dest, 2)) == 16
18691 && REG_P (XEXP (set_dest, 0))
18692 && REG_P (SET_DEST (prev_set))
18693 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18694 {
18695 return true;
18696 }
18697 }
18698
18699 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18700 {
18701
18702 /* We're trying to match:
18703 prev (adrp) == (set (reg r1)
18704 (high (symbol_ref ("SYM"))))
18705 curr (add) == (set (reg r0)
18706 (lo_sum (reg r1)
18707 (symbol_ref ("SYM"))))
18708 Note that r0 need not necessarily be the same as r1, especially
18709 during pre-regalloc scheduling. */
18710
18711 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18712 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18713 {
18714 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18715 && REG_P (XEXP (SET_SRC (curr_set), 0))
18716 && REGNO (XEXP (SET_SRC (curr_set), 0))
18717 == REGNO (SET_DEST (prev_set))
18718 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18719 XEXP (SET_SRC (curr_set), 1)))
18720 return true;
18721 }
18722 }
18723
18724 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18725 {
18726
18727 /* We're trying to match:
18728 prev (movk) == (set (zero_extract (reg r0)
18729 (const_int 16)
18730 (const_int 32))
18731 (const_int imm16_1))
18732 curr (movk) == (set (zero_extract (reg r0)
18733 (const_int 16)
18734 (const_int 48))
18735 (const_int imm16_2)) */
18736
18737 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18738 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18739 && REG_P (XEXP (SET_DEST (prev_set), 0))
18740 && REG_P (XEXP (SET_DEST (curr_set), 0))
18741 && REGNO (XEXP (SET_DEST (prev_set), 0))
18742 == REGNO (XEXP (SET_DEST (curr_set), 0))
18743 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18744 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18745 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18746 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18747 && CONST_INT_P (SET_SRC (prev_set))
18748 && CONST_INT_P (SET_SRC (curr_set)))
18749 return true;
18750
18751 }
18752 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18753 {
18754 /* We're trying to match:
18755 prev (adrp) == (set (reg r0)
18756 (high (symbol_ref ("SYM"))))
18757 curr (ldr) == (set (reg r1)
18758 (mem (lo_sum (reg r0)
18759 (symbol_ref ("SYM")))))
18760 or
18761 curr (ldr) == (set (reg r1)
18762 (zero_extend (mem
18763 (lo_sum (reg r0)
18764 (symbol_ref ("SYM")))))) */
18765 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18766 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18767 {
18768 rtx curr_src = SET_SRC (curr_set);
18769
18770 if (GET_CODE (curr_src) == ZERO_EXTEND)
18771 curr_src = XEXP (curr_src, 0);
18772
18773 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18774 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18775 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18776 == REGNO (SET_DEST (prev_set))
18777 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18778 XEXP (SET_SRC (prev_set), 0)))
18779 return true;
18780 }
18781 }
18782
18783 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18784 && any_condjump_p (curr))
18785 {
18786 unsigned int condreg1, condreg2;
18787 rtx cc_reg_1;
18788 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18789 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18790
18791 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18792 && prev
18793 && modified_in_p (cc_reg_1, prev))
18794 {
18795 enum attr_type prev_type = get_attr_type (prev);
18796
18797 /* FIXME: this misses some cases that are considered simple arithmetic
18798 instructions for ThunderX. Simple shifts are missed here. */
18799 if (prev_type == TYPE_ALUS_SREG
18800 || prev_type == TYPE_ALUS_IMM
18801 || prev_type == TYPE_LOGICS_REG
18802 || prev_type == TYPE_LOGICS_IMM)
18803 return true;
18804 }
18805 }
18806
18807 if (prev_set
18808 && curr_set
18809 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18810 && any_condjump_p (curr))
18811 {
18812 /* We're trying to match:
18813 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18814 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18815 (const_int 0))
18816 (label_ref ("SYM"))
18817 (pc)) */
18818 if (SET_DEST (curr_set) == (pc_rtx)
18819 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18820 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18821 && REG_P (SET_DEST (prev_set))
18822 && REGNO (SET_DEST (prev_set))
18823 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18824 {
18825 /* Fuse ALU operations followed by conditional branch instruction. */
18826 switch (get_attr_type (prev))
18827 {
18828 case TYPE_ALU_IMM:
18829 case TYPE_ALU_SREG:
18830 case TYPE_ADC_REG:
18831 case TYPE_ADC_IMM:
18832 case TYPE_ADCS_REG:
18833 case TYPE_ADCS_IMM:
18834 case TYPE_LOGIC_REG:
18835 case TYPE_LOGIC_IMM:
18836 case TYPE_CSEL:
18837 case TYPE_ADR:
18838 case TYPE_MOV_IMM:
18839 case TYPE_SHIFT_REG:
18840 case TYPE_SHIFT_IMM:
18841 case TYPE_BFM:
18842 case TYPE_RBIT:
18843 case TYPE_REV:
18844 case TYPE_EXTEND:
18845 return true;
18846
18847 default:;
18848 }
18849 }
18850 }
18851
18852 return false;
18853 }
18854
18855 /* Return true iff the instruction fusion described by OP is enabled. */
18856
18857 bool
18858 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18859 {
18860 return (aarch64_tune_params.fusible_ops & op) != 0;
18861 }
18862
18863 /* If MEM is in the form of [base+offset], extract the two parts
18864 of the address into BASE and OFFSET, otherwise return false
18865 after clearing BASE and OFFSET. */
18866
18867 bool
18868 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18869 {
18870 rtx addr;
18871
18872 gcc_assert (MEM_P (mem));
18873
18874 addr = XEXP (mem, 0);
18875
18876 if (REG_P (addr))
18877 {
18878 *base = addr;
18879 *offset = const0_rtx;
18880 return true;
18881 }
18882
18883 if (GET_CODE (addr) == PLUS
18884 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18885 {
18886 *base = XEXP (addr, 0);
18887 *offset = XEXP (addr, 1);
18888 return true;
18889 }
18890
18891 *base = NULL_RTX;
18892 *offset = NULL_RTX;
18893
18894 return false;
18895 }
18896
18897 /* Types for scheduling fusion. */
18898 enum sched_fusion_type
18899 {
18900 SCHED_FUSION_NONE = 0,
18901 SCHED_FUSION_LD_SIGN_EXTEND,
18902 SCHED_FUSION_LD_ZERO_EXTEND,
18903 SCHED_FUSION_LD,
18904 SCHED_FUSION_ST,
18905 SCHED_FUSION_NUM
18906 };
18907
18908 /* If INSN is a load or store of an address in the form of [base+offset],
18909 extract the two parts into BASE and OFFSET. Return the scheduling
18910 fusion type of this INSN. */
18911
18912 static enum sched_fusion_type
18913 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18914 {
18915 rtx x, dest, src;
18916 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18917
18918 gcc_assert (INSN_P (insn));
18919 x = PATTERN (insn);
18920 if (GET_CODE (x) != SET)
18921 return SCHED_FUSION_NONE;
18922
18923 src = SET_SRC (x);
18924 dest = SET_DEST (x);
18925
18926 machine_mode dest_mode = GET_MODE (dest);
18927
18928 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18929 return SCHED_FUSION_NONE;
18930
18931 if (GET_CODE (src) == SIGN_EXTEND)
18932 {
18933 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18934 src = XEXP (src, 0);
18935 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18936 return SCHED_FUSION_NONE;
18937 }
18938 else if (GET_CODE (src) == ZERO_EXTEND)
18939 {
18940 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18941 src = XEXP (src, 0);
18942 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18943 return SCHED_FUSION_NONE;
18944 }
18945
18946 if (GET_CODE (src) == MEM && REG_P (dest))
18947 extract_base_offset_in_addr (src, base, offset);
18948 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18949 {
18950 fusion = SCHED_FUSION_ST;
18951 extract_base_offset_in_addr (dest, base, offset);
18952 }
18953 else
18954 return SCHED_FUSION_NONE;
18955
18956 if (*base == NULL_RTX || *offset == NULL_RTX)
18957 fusion = SCHED_FUSION_NONE;
18958
18959 return fusion;
18960 }
18961
18962 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18963
18964 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18965 and PRI are only calculated for these instructions. For other instructions,
18966 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18967 types of instruction fusion can be added by returning different priorities.
18968
18969 It's important that irrelevant instructions get the largest FUSION_PRI. */
18970
18971 static void
18972 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18973 int *fusion_pri, int *pri)
18974 {
18975 int tmp, off_val;
18976 rtx base, offset;
18977 enum sched_fusion_type fusion;
18978
18979 gcc_assert (INSN_P (insn));
18980
18981 tmp = max_pri - 1;
18982 fusion = fusion_load_store (insn, &base, &offset);
18983 if (fusion == SCHED_FUSION_NONE)
18984 {
18985 *pri = tmp;
18986 *fusion_pri = tmp;
18987 return;
18988 }
18989
18990 /* Set FUSION_PRI according to fusion type and base register. */
18991 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18992
18993 /* Calculate PRI. */
18994 tmp /= 2;
18995
18996 /* INSN with smaller offset goes first. */
18997 off_val = (int)(INTVAL (offset));
18998 if (off_val >= 0)
18999 tmp -= (off_val & 0xfffff);
19000 else
19001 tmp += ((- off_val) & 0xfffff);
19002
19003 *pri = tmp;
19004 return;
19005 }
19006
19007 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19008 Adjust priority of sha1h instructions so they are scheduled before
19009 other SHA1 instructions. */
19010
19011 static int
19012 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19013 {
19014 rtx x = PATTERN (insn);
19015
19016 if (GET_CODE (x) == SET)
19017 {
19018 x = SET_SRC (x);
19019
19020 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19021 return priority + 10;
19022 }
19023
19024 return priority;
19025 }
19026
19027 /* Given OPERANDS of consecutive load/store, check if we can merge
19028 them into ldp/stp. LOAD is true if they are load instructions.
19029 MODE is the mode of memory operands. */
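/* For example (illustration only):

       ldr  w0, [x2]
       ldr  w1, [x2, 4]

   can be merged by the ldp/stp peepholes into "ldp w0, w1, [x2]" when this
   function accepts them.  */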
19030
19031 bool
19032 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19033 machine_mode mode)
19034 {
19035 HOST_WIDE_INT offval_1, offval_2, msize;
19036 enum reg_class rclass_1, rclass_2;
19037 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19038
19039 if (load)
19040 {
19041 mem_1 = operands[1];
19042 mem_2 = operands[3];
19043 reg_1 = operands[0];
19044 reg_2 = operands[2];
19045 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19046 if (REGNO (reg_1) == REGNO (reg_2))
19047 return false;
19048 }
19049 else
19050 {
19051 mem_1 = operands[0];
19052 mem_2 = operands[2];
19053 reg_1 = operands[1];
19054 reg_2 = operands[3];
19055 }
19056
19057 /* The mems cannot be volatile. */
19058 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19059 return false;
19060
19061 /* If we have SImode and slow unaligned ldp,
19062 check that the alignment is at least 8 bytes. */
19063 if (mode == SImode
19064 && (aarch64_tune_params.extra_tuning_flags
19065 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19066 && !optimize_size
19067 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19068 return false;
19069
19070 /* Check if the addresses are in the form of [base+offset]. */
19071 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19072 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19073 return false;
19074 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19075 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19076 return false;
19077
19078 /* Check if the bases are the same. */
19079 if (!rtx_equal_p (base_1, base_2))
19080 return false;
19081
19082 /* The operands must be of the same size. */
19083 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19084 GET_MODE_SIZE (GET_MODE (mem_2))));
19085
19086 offval_1 = INTVAL (offset_1);
19087 offval_2 = INTVAL (offset_2);
19088 /* We should only be trying this for fixed-sized modes. There is no
19089 SVE LDP/STP instruction. */
19090 msize = GET_MODE_SIZE (mode).to_constant ();
19091 /* Check if the offsets are consecutive. */
19092 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19093 return false;
19094
19095 /* Check if the addresses are clobbered by load. */
19096 if (load)
19097 {
19098 if (reg_mentioned_p (reg_1, mem_1))
19099 return false;
19100
19101 /* In increasing order, the last load can clobber the address. */
19102 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19103 return false;
19104 }
19105
19106 /* One of the memory accesses must be a mempair operand.
19107 If it is not the first one, they need to be swapped by the
19108 peephole. */
19109 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19110 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19111 return false;
19112
19113 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19114 rclass_1 = FP_REGS;
19115 else
19116 rclass_1 = GENERAL_REGS;
19117
19118 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19119 rclass_2 = FP_REGS;
19120 else
19121 rclass_2 = GENERAL_REGS;
19122
19123 /* Check if the registers are of the same class. */
19124 if (rclass_1 != rclass_2)
19125 return false;
19126
19127 return true;
19128 }
19129
19130 /* Given OPERANDS of consecutive load/store that can be merged,
19131 swap them if they are not in ascending order. */
19132 void
19133 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19134 {
19135 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19136 HOST_WIDE_INT offval_1, offval_2;
19137
19138 if (load)
19139 {
19140 mem_1 = operands[1];
19141 mem_2 = operands[3];
19142 }
19143 else
19144 {
19145 mem_1 = operands[0];
19146 mem_2 = operands[2];
19147 }
19148
19149 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19150 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19151
19152 offval_1 = INTVAL (offset_1);
19153 offval_2 = INTVAL (offset_2);
19154
19155 if (offval_1 > offval_2)
19156 {
19157 /* Irrespective of whether this is a load or a store,
19158 we do the same swap. */
19159 std::swap (operands[0], operands[2]);
19160 std::swap (operands[1], operands[3]);
19161 }
19162 }
19163
19164 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19165 comparison between the two. */
19166 int
19167 aarch64_host_wide_int_compare (const void *x, const void *y)
19168 {
19169 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19170 * ((const HOST_WIDE_INT *) y));
19171 }
19172
19173 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19174 other pointing to a REG rtx containing an offset, compare the offsets
19175 of the two pairs.
19176
19177 Return:
19178
19179 1 iff offset (X) > offset (Y)
19180 0 iff offset (X) == offset (Y)
19181 -1 iff offset (X) < offset (Y) */
19182 int
19183 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19184 {
19185 const rtx * operands_1 = (const rtx *) x;
19186 const rtx * operands_2 = (const rtx *) y;
19187 rtx mem_1, mem_2, base, offset_1, offset_2;
19188
19189 if (MEM_P (operands_1[0]))
19190 mem_1 = operands_1[0];
19191 else
19192 mem_1 = operands_1[1];
19193
19194 if (MEM_P (operands_2[0]))
19195 mem_2 = operands_2[0];
19196 else
19197 mem_2 = operands_2[1];
19198
19199 /* Extract the offsets. */
19200 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19201 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19202
19203 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19204
19205 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19206 }
19207
19208 /* Given OPERANDS of consecutive load/store, check if we can merge
19209 them into ldp/stp by adjusting the offset. LOAD is true if they
19210 are load instructions. MODE is the mode of memory operands.
19211
19212 Given below consecutive stores:
19213
19214 str w1, [xb, 0x100]
19215 str w1, [xb, 0x104]
19216 str w1, [xb, 0x108]
19217 str w1, [xb, 0x10c]
19218
19219 Though the offsets are out of the range supported by stp, we can
19220 still pair them after adjusting the offset, like:
19221
19222 add scratch, xb, 0x100
19223 stp w1, w1, [scratch]
19224 stp w1, w1, [scratch, 0x8]
19225
19226 The peephole patterns detecting this opportunity should guarantee
19227 the scratch register is available. */
19228
19229 bool
19230 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19231 scalar_mode mode)
19232 {
19233 const int num_insns = 4;
19234 enum reg_class rclass;
19235 HOST_WIDE_INT offvals[num_insns], msize;
19236 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19237
19238 if (load)
19239 {
19240 for (int i = 0; i < num_insns; i++)
19241 {
19242 reg[i] = operands[2 * i];
19243 mem[i] = operands[2 * i + 1];
19244
19245 gcc_assert (REG_P (reg[i]));
19246 }
19247
19248 /* Do not attempt to merge the loads if the loads clobber each other. */
19249 for (int i = 0; i < 8; i += 2)
19250 for (int j = i + 2; j < 8; j += 2)
19251 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19252 return false;
19253 }
19254 else
19255 for (int i = 0; i < num_insns; i++)
19256 {
19257 mem[i] = operands[2 * i];
19258 reg[i] = operands[2 * i + 1];
19259 }
19260
19261 /* Skip if memory operand is by itself valid for ldp/stp. */
19262 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19263 return false;
19264
19265 for (int i = 0; i < num_insns; i++)
19266 {
19267 /* The mems cannot be volatile. */
19268 if (MEM_VOLATILE_P (mem[i]))
19269 return false;
19270
19271 /* Check if the addresses are in the form of [base+offset]. */
19272 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19273 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19274 return false;
19275 }
19276
19277 /* Check if the registers are of the same class. */
19278 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19279 ? FP_REGS : GENERAL_REGS;
19280
19281 for (int i = 1; i < num_insns; i++)
19282 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19283 {
19284 if (rclass != FP_REGS)
19285 return false;
19286 }
19287 else
19288 {
19289 if (rclass != GENERAL_REGS)
19290 return false;
19291 }
19292
19293 /* Only the last register in the order in which they occur
19294 may be clobbered by the load. */
19295 if (rclass == GENERAL_REGS && load)
19296 for (int i = 0; i < num_insns - 1; i++)
19297 if (reg_mentioned_p (reg[i], mem[i]))
19298 return false;
19299
19300 /* Check if the bases are the same. */
19301 for (int i = 0; i < num_insns - 1; i++)
19302 if (!rtx_equal_p (base[i], base[i + 1]))
19303 return false;
19304
19305 for (int i = 0; i < num_insns; i++)
19306 offvals[i] = INTVAL (offset[i]);
19307
19308 msize = GET_MODE_SIZE (mode);
19309
19310 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19311 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19312 aarch64_host_wide_int_compare);
19313
19314 if (!(offvals[1] == offvals[0] + msize
19315 && offvals[3] == offvals[2] + msize))
19316 return false;
19317
19318 /* Check that offsets are within range of each other. The ldp/stp
19319 instructions have 7 bit immediate offsets, so use 0x80. */
19320 if (offvals[2] - offvals[0] >= msize * 0x80)
19321 return false;
19322
19323 /* The offsets must be aligned with respect to each other. */
19324 if (offvals[0] % msize != offvals[2] % msize)
19325 return false;
19326
19327 /* If we have SImode and slow unaligned ldp,
19328 check that the alignment is at least 8 bytes. */
19329 if (mode == SImode
19330 && (aarch64_tune_params.extra_tuning_flags
19331 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19332 && !optimize_size
19333 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19334 return false;
19335
19336 return true;
19337 }
19338
19339 /* Given OPERANDS of consecutive load/store, this function pairs them
19340 into LDP/STP after adjusting the offset. It depends on the fact
19341 that the operands can be sorted so the offsets are correct for STP.
19342 MODE is the mode of memory operands. CODE is the rtl operator
19343 which should be applied to all memory operands; it is SIGN_EXTEND,
19344 ZERO_EXTEND or UNKNOWN. */
19345
19346 bool
19347 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19348 scalar_mode mode, RTX_CODE code)
19349 {
19350 rtx base, offset_1, offset_3, t1, t2;
19351 rtx mem_1, mem_2, mem_3, mem_4;
19352 rtx temp_operands[8];
19353 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19354 stp_off_upper_limit, stp_off_lower_limit, msize;
19355
19356 /* We make changes on a copy as we may still bail out. */
19357 for (int i = 0; i < 8; i ++)
19358 temp_operands[i] = operands[i];
19359
19360 /* Sort the operands. */
19361 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19362
19363 /* Copy the memory operands so that if we have to bail for some
19364 reason the original addresses are unchanged. */
19365 if (load)
19366 {
19367 mem_1 = copy_rtx (temp_operands[1]);
19368 mem_2 = copy_rtx (temp_operands[3]);
19369 mem_3 = copy_rtx (temp_operands[5]);
19370 mem_4 = copy_rtx (temp_operands[7]);
19371 }
19372 else
19373 {
19374 mem_1 = copy_rtx (temp_operands[0]);
19375 mem_2 = copy_rtx (temp_operands[2]);
19376 mem_3 = copy_rtx (temp_operands[4]);
19377 mem_4 = copy_rtx (temp_operands[6]);
19378 gcc_assert (code == UNKNOWN);
19379 }
19380
19381 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19382 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19383 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19384 && offset_3 != NULL_RTX);
19385
19386 /* Adjust offset so it can fit in LDP/STP instruction. */
19387 msize = GET_MODE_SIZE (mode);
19388 stp_off_upper_limit = msize * (0x40 - 1);
19389 stp_off_lower_limit = - msize * 0x40;
19390
19391 off_val_1 = INTVAL (offset_1);
19392 off_val_3 = INTVAL (offset_3);
19393
19394 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19395 if (msize <= 4)
19396 base_off = (off_val_1 + off_val_3) / 2;
19397 else
19398 /* However, due to issues with negative LDP/STP offset generation for
19399 larger modes (DF, DI and vector modes), we must not use negative
19400 addresses smaller than 9 signed unadjusted bits can store. Using
19401 the first offset provides the most range in this case. */
19402 base_off = off_val_1;
19403
19404 /* Adjust the base so that it is aligned with the addresses but still
19405 optimal. */
19406 if (base_off % msize != off_val_1 % msize)
19407 /* Fix the offset, bearing in mind that we want to make it bigger,
19408 not smaller. */
19409 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19410 else if (msize <= 4)
19411 /* The negative range of LDP/STP is one larger than the positive range. */
19412 base_off += msize;
19413
19414 /* Check if base offset is too big or too small. We can attempt to resolve
19415 this issue by setting it to the maximum value and seeing if the offsets
19416 still fit. */
19417 if (base_off >= 0x1000)
19418 {
19419 base_off = 0x1000 - 1;
19420 /* We must still make sure that the base offset is aligned with respect
19421 to the address, but it may not be made any bigger. */
19422 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19423 }
19424
19425 /* Likewise for the case where the base is too small. */
19426 if (base_off <= -0x1000)
19427 {
19428 base_off = -0x1000 + 1;
19429 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19430 }
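/* Continuing the hypothetical DImode example from the suitability
   check earlier in this file: with off_val_1 == 0x1000 and
   off_val_3 == 0x1010, msize == 8 selects base_off = 0x1000, which
   is clamped to 0xfff and then realigned down to 0xff8. That gives
   new_off_1 = 8 and new_off_3 = 0x18, both inside
   [-msize * 0x40, msize * (0x40 - 1)] = [-512, 504]. */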
19431
19432 /* Offset of the first STP/LDP. */
19433 new_off_1 = off_val_1 - base_off;
19434
19435 /* Offset of the second STP/LDP. */
19436 new_off_3 = off_val_3 - base_off;
19437
19438 /* The offsets must be within the range of the LDP/STP instructions. */
19439 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19440 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19441 return false;
19442
19443 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19444 new_off_1), true);
19445 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19446 new_off_1 + msize), true);
19447 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19448 new_off_3), true);
19449 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19450 new_off_3 + msize), true);
19451
19452 if (!aarch64_mem_pair_operand (mem_1, mode)
19453 || !aarch64_mem_pair_operand (mem_3, mode))
19454 return false;
19455
19456 if (code == ZERO_EXTEND)
19457 {
19458 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19459 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19460 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19461 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19462 }
19463 else if (code == SIGN_EXTEND)
19464 {
19465 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19466 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19467 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19468 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19469 }
19470
19471 if (load)
19472 {
19473 operands[0] = temp_operands[0];
19474 operands[1] = mem_1;
19475 operands[2] = temp_operands[2];
19476 operands[3] = mem_2;
19477 operands[4] = temp_operands[4];
19478 operands[5] = mem_3;
19479 operands[6] = temp_operands[6];
19480 operands[7] = mem_4;
19481 }
19482 else
19483 {
19484 operands[0] = mem_1;
19485 operands[1] = temp_operands[1];
19486 operands[2] = mem_2;
19487 operands[3] = temp_operands[3];
19488 operands[4] = mem_3;
19489 operands[5] = temp_operands[5];
19490 operands[6] = mem_4;
19491 operands[7] = temp_operands[7];
19492 }
19493
19494 /* Emit adjusting instruction. */
19495 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19496 /* Emit ldp/stp instructions. */
19497 t1 = gen_rtx_SET (operands[0], operands[1]);
19498 t2 = gen_rtx_SET (operands[2], operands[3]);
19499 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19500 t1 = gen_rtx_SET (operands[4], operands[5]);
19501 t2 = gen_rtx_SET (operands[6], operands[7]);
19502 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19503 return true;
19504 }
19505
19506 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19507 it isn't worth branching around empty masked ops (including masked
19508 stores). */
19509
19510 static bool
19511 aarch64_empty_mask_is_expensive (unsigned)
19512 {
19513 return false;
19514 }
19515
19516 /* Return true if a pseudo register should be created and used to hold
19517 the GOT address for PIC code. */
19518
19519 bool
19520 aarch64_use_pseudo_pic_reg (void)
19521 {
19522 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19523 }
19524
19525 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19526
19527 static int
19528 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19529 {
19530 switch (XINT (x, 1))
19531 {
19532 case UNSPEC_GOTSMALLPIC:
19533 case UNSPEC_GOTSMALLPIC28K:
19534 case UNSPEC_GOTTINYPIC:
19535 return 0;
19536 default:
19537 break;
19538 }
19539
19540 return default_unspec_may_trap_p (x, flags);
19541 }
19542
19543
19544 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19545 return the log2 of that value. Otherwise return -1. */
19546
19547 int
19548 aarch64_fpconst_pow_of_2 (rtx x)
19549 {
19550 const REAL_VALUE_TYPE *r;
19551
19552 if (!CONST_DOUBLE_P (x))
19553 return -1;
19554
19555 r = CONST_DOUBLE_REAL_VALUE (x);
19556
19557 if (REAL_VALUE_NEGATIVE (*r)
19558 || REAL_VALUE_ISNAN (*r)
19559 || REAL_VALUE_ISINF (*r)
19560 || !real_isinteger (r, DFmode))
19561 return -1;
19562
19563 return exact_log2 (real_to_integer (r));
19564 }
19565
19566 /* If X is a vector of equal CONST_DOUBLE values and that value is
19567 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19568
19569 int
19570 aarch64_vec_fpconst_pow_of_2 (rtx x)
19571 {
19572 int nelts;
19573 if (GET_CODE (x) != CONST_VECTOR
19574 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19575 return -1;
19576
19577 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19578 return -1;
19579
19580 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19581 if (firstval <= 0)
19582 return -1;
19583
19584 for (int i = 1; i < nelts; i++)
19585 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19586 return -1;
19587
19588 return firstval;
19589 }
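/* Illustrative examples (hypothetical constants): a CONST_DOUBLE of
   4.0 gives aarch64_fpconst_pow_of_2 == 2, while 3.0, -2.0 or NaN
   give -1. For the vector variant, a V2DF constant {4.0, 4.0}
   returns 2, whereas {4.0, 2.0} returns -1 because the elements
   disagree. */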
19590
19591 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19592 to float.
19593
19594 __fp16 always promotes through this hook.
19595 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19596 through the generic excess precision logic rather than here. */
19597
19598 static tree
19599 aarch64_promoted_type (const_tree t)
19600 {
19601 if (SCALAR_FLOAT_TYPE_P (t)
19602 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19603 return float_type_node;
19604
19605 return NULL_TREE;
19606 }
19607
19608 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19609
19610 static bool
19611 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19612 optimization_type opt_type)
19613 {
19614 switch (op)
19615 {
19616 case rsqrt_optab:
19617 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19618
19619 default:
19620 return true;
19621 }
19622 }
19623
19624 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19625
19626 static unsigned int
19627 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19628 int *offset)
19629 {
19630 /* Polynomial invariant 1 == (VG / 2) - 1. */
19631 gcc_assert (i == 1);
19632 *factor = 2;
19633 *offset = 1;
19634 return AARCH64_DWARF_VG;
19635 }
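/* Illustrative sketch (hypothetical vector length): the hook above
   describes indeterminate 1 as (AARCH64_DWARF_VG / factor) - offset,
   i.e. (VG / 2) - 1. On a 256-bit SVE implementation VG is 4
   (64-bit granules per vector), so the indeterminate is 1 and a
   poly_int such as 16 + 16x, the byte size of an SVE vector,
   evaluates to 32. */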
19636
19637 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19638 if MODE is HFmode, and punt to the generic implementation otherwise. */
19639
19640 static bool
19641 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19642 {
19643 return (mode == HFmode
19644 ? true
19645 : default_libgcc_floating_mode_supported_p (mode));
19646 }
19647
19648 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19649 if MODE is HFmode, and punt to the generic implementation otherwise. */
19650
19651 static bool
19652 aarch64_scalar_mode_supported_p (scalar_mode mode)
19653 {
19654 return (mode == HFmode
19655 ? true
19656 : default_scalar_mode_supported_p (mode));
19657 }
19658
19659 /* Set the value of FLT_EVAL_METHOD.
19660 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19661
19662 0: evaluate all operations and constants, whose semantic type has at
19663 most the range and precision of type float, to the range and
19664 precision of float; evaluate all other operations and constants to
19665 the range and precision of the semantic type;
19666
19667 N, where _FloatN is a supported interchange floating type:
19668 evaluate all operations and constants, whose semantic type has at
19669 most the range and precision of _FloatN type, to the range and
19670 precision of the _FloatN type; evaluate all other operations and
19671 constants to the range and precision of the semantic type;
19672
19673 If we have the ARMv8.2-A extensions then we support _Float16 in native
19674 precision, so we should set this to 16. Otherwise, we support the type,
19675 but want to evaluate expressions in float precision, so set this to
19676 0. */
19677
19678 static enum flt_eval_method
19679 aarch64_excess_precision (enum excess_precision_type type)
19680 {
19681 switch (type)
19682 {
19683 case EXCESS_PRECISION_TYPE_FAST:
19684 case EXCESS_PRECISION_TYPE_STANDARD:
19685 /* We can calculate either in 16-bit range and precision or
19686 32-bit range and precision. Make that decision based on whether
19687 we have native support for the ARMv8.2-A 16-bit floating-point
19688 instructions or not. */
19689 return (TARGET_FP_F16INST
19690 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19691 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19692 case EXCESS_PRECISION_TYPE_IMPLICIT:
19693 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19694 default:
19695 gcc_unreachable ();
19696 }
19697 return FLT_EVAL_METHOD_UNPREDICTABLE;
19698 }
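/* Illustrative sketch of the effect (hypothetical source): for
   "_Float16 a, b, c; c = a * b;", a target with the ARMv8.2-A
   half-precision instructions (TARGET_FP_F16INST) reports
   FLT_EVAL_METHOD 16 and may evaluate the multiplication directly
   in _Float16, whereas without them the operands are promoted and
   the multiplication is carried out in float, with only the final
   assignment narrowing back to _Float16. */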
19699
19700 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19701 scheduled for speculative execution. Reject the long-running division
19702 and square-root instructions. */
19703
19704 static bool
19705 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19706 {
19707 switch (get_attr_type (insn))
19708 {
19709 case TYPE_SDIV:
19710 case TYPE_UDIV:
19711 case TYPE_FDIVS:
19712 case TYPE_FDIVD:
19713 case TYPE_FSQRTS:
19714 case TYPE_FSQRTD:
19715 case TYPE_NEON_FP_SQRT_S:
19716 case TYPE_NEON_FP_SQRT_D:
19717 case TYPE_NEON_FP_SQRT_S_Q:
19718 case TYPE_NEON_FP_SQRT_D_Q:
19719 case TYPE_NEON_FP_DIV_S:
19720 case TYPE_NEON_FP_DIV_D:
19721 case TYPE_NEON_FP_DIV_S_Q:
19722 case TYPE_NEON_FP_DIV_D_Q:
19723 return false;
19724 default:
19725 return true;
19726 }
19727 }
19728
19729 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19730
19731 static int
19732 aarch64_compute_pressure_classes (reg_class *classes)
19733 {
19734 int i = 0;
19735 classes[i++] = GENERAL_REGS;
19736 classes[i++] = FP_REGS;
19737 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19738 registers need to go in PR_LO_REGS at some point during their
19739 lifetime. Splitting it into two halves has the effect of making
19740 all predicates count against PR_LO_REGS, so that we try whenever
19741 possible to restrict the number of live predicates to 8. This
19742 greatly reduces the amount of spilling in certain loops. */
19743 classes[i++] = PR_LO_REGS;
19744 classes[i++] = PR_HI_REGS;
19745 return i;
19746 }
19747
19748 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19749
19750 static bool
19751 aarch64_can_change_mode_class (machine_mode from,
19752 machine_mode to, reg_class_t)
19753 {
19754 if (BYTES_BIG_ENDIAN)
19755 {
19756 bool from_sve_p = aarch64_sve_data_mode_p (from);
19757 bool to_sve_p = aarch64_sve_data_mode_p (to);
19758
19759 /* Don't allow changes between SVE data modes and non-SVE modes.
19760 See the comment at the head of aarch64-sve.md for details. */
19761 if (from_sve_p != to_sve_p)
19762 return false;
19763
19764 /* Don't allow changes in element size: lane 0 of the new vector
19765 would not then be lane 0 of the old vector. See the comment
19766 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19767 description.
19768
19769 In the worst case, this forces a register to be spilled in
19770 one mode and reloaded in the other, which handles the
19771 endianness correctly. */
19772 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19773 return false;
19774 }
19775 return true;
19776 }
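/* Illustrative example (hypothetical big-endian subreg): reinterpreting
   a VNx4SI value as VNx2DI changes the element size from 4 to 8
   bytes, so the hook above rejects the mode change; in the worst
   case the register is spilled in one mode and reloaded in the
   other, which handles the endianness correctly. */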
19777
19778 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
19779
19780 static void
19781 aarch64_select_early_remat_modes (sbitmap modes)
19782 {
19783 /* SVE values are not normally live across a call, so it should be
19784 worth doing early rematerialization even in VL-specific mode. */
19785 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19786 {
19787 machine_mode mode = (machine_mode) i;
19788 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19789 if (vec_flags & VEC_ANY_SVE)
19790 bitmap_set_bit (modes, i);
19791 }
19792 }
19793
19794 /* Override the default target speculation_safe_value. */
19795 static rtx
19796 aarch64_speculation_safe_value (machine_mode mode,
19797 rtx result, rtx val, rtx failval)
19798 {
19799 /* Maybe we should warn if falling back to hard barriers. They are
19800 likely to be noticeably more expensive than the alternative below. */
19801 if (!aarch64_track_speculation)
19802 return default_speculation_safe_value (mode, result, val, failval);
19803
19804 if (!REG_P (val))
19805 val = copy_to_mode_reg (mode, val);
19806
19807 if (!aarch64_reg_or_zero (failval, mode))
19808 failval = copy_to_mode_reg (mode, failval);
19809
19810 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19811 return result;
19812 }
19813
19814 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19815 Look into the tuning structure for an estimate.
19816 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19817 Advanced SIMD 128 bits. */
19818
19819 static HOST_WIDE_INT
19820 aarch64_estimated_poly_value (poly_int64 val)
19821 {
19822 enum aarch64_sve_vector_bits_enum width_source
19823 = aarch64_tune_params.sve_width;
19824
19825 /* If we still don't have an estimate, use the default. */
19826 if (width_source == SVE_SCALABLE)
19827 return default_estimated_poly_value (val);
19828
19829 HOST_WIDE_INT over_128 = width_source - 128;
19830 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19831 }
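/* Illustrative sketch (hypothetical tuning): with sve_width set to
   SVE_256, over_128 is 128, so a poly_int64 of {16, 16} (the number
   of bytes in an SVE vector) is estimated as 16 + 16 * 128 / 128
   = 32. With SVE_SCALABLE the generic default estimate is used
   instead. */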
19832
19833
19834 /* Return true for types that could be supported as SIMD return or
19835 argument types. */
19836
19837 static bool
19838 supported_simd_type (tree t)
19839 {
19840 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19841 {
19842 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19843 return s == 1 || s == 2 || s == 4 || s == 8;
19844 }
19845 return false;
19846 }
19847
19848 /* Return true for types that are currently supported as SIMD return
19849 or argument types. */
19850
19851 static bool
19852 currently_supported_simd_type (tree t, tree b)
19853 {
19854 if (COMPLEX_FLOAT_TYPE_P (t))
19855 return false;
19856
19857 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19858 return false;
19859
19860 return supported_simd_type (t);
19861 }
19862
19863 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19864
19865 static int
19866 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19867 struct cgraph_simd_clone *clonei,
19868 tree base_type, int num)
19869 {
19870 tree t, ret_type, arg_type;
19871 unsigned int elt_bits, vec_bits, count;
19872
19873 if (!TARGET_SIMD)
19874 return 0;
19875
19876 if (clonei->simdlen
19877 && (clonei->simdlen < 2
19878 || clonei->simdlen > 1024
19879 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19880 {
19881 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19882 "unsupported simdlen %d", clonei->simdlen);
19883 return 0;
19884 }
19885
19886 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19887 if (TREE_CODE (ret_type) != VOID_TYPE
19888 && !currently_supported_simd_type (ret_type, base_type))
19889 {
19890 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19891 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19892 "GCC does not currently support mixed size types "
19893 "for %<simd%> functions");
19894 else if (supported_simd_type (ret_type))
19895 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19896 "GCC does not currently support return type %qT "
19897 "for %<simd%> functions", ret_type);
19898 else
19899 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19900 "unsupported return type %qT for %<simd%> functions",
19901 ret_type);
19902 return 0;
19903 }
19904
19905 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19906 {
19907 arg_type = TREE_TYPE (t);
19908
19909 if (!currently_supported_simd_type (arg_type, base_type))
19910 {
19911 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19912 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19913 "GCC does not currently support mixed size types "
19914 "for %<simd%> functions");
19915 else
19916 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19917 "GCC does not currently support argument type %qT "
19918 "for %<simd%> functions", arg_type);
19919 return 0;
19920 }
19921 }
19922
19923 clonei->vecsize_mangle = 'n';
19924 clonei->mask_mode = VOIDmode;
19925 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19926 if (clonei->simdlen == 0)
19927 {
19928 count = 2;
19929 vec_bits = (num == 0 ? 64 : 128);
19930 clonei->simdlen = vec_bits / elt_bits;
19931 }
19932 else
19933 {
19934 count = 1;
19935 vec_bits = clonei->simdlen * elt_bits;
19936 if (vec_bits != 64 && vec_bits != 128)
19937 {
19938 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19939 "GCC does not currently support simdlen %d for type %qT",
19940 clonei->simdlen, base_type);
19941 return 0;
19942 }
19943 }
19944 clonei->vecsize_int = vec_bits;
19945 clonei->vecsize_float = vec_bits;
19946 return count;
19947 }
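/* Illustrative sketch (hypothetical declaration): for a
   "#pragma omp declare simd" function whose base type is float
   (elt_bits == 32) and which has no explicit simdlen, the code
   above creates two clones, one with simdlen 2 (64-bit vectors)
   and one with simdlen 4 (128-bit vectors). An explicit simdlen (8)
   for the same base type would need 256-bit vectors and is
   therefore diagnosed and rejected. */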
19948
19949 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19950
19951 static void
19952 aarch64_simd_clone_adjust (struct cgraph_node *node)
19953 {
19954 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19955 use the correct ABI. */
19956
19957 tree t = TREE_TYPE (node->decl);
19958 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19959 TYPE_ATTRIBUTES (t));
19960 }
19961
19962 /* Implement TARGET_SIMD_CLONE_USABLE. */
19963
19964 static int
19965 aarch64_simd_clone_usable (struct cgraph_node *node)
19966 {
19967 switch (node->simdclone->vecsize_mangle)
19968 {
19969 case 'n':
19970 if (!TARGET_SIMD)
19971 return -1;
19972 return 0;
19973 default:
19974 gcc_unreachable ();
19975 }
19976 }
19977
19978 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19979
19980 static int
19981 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19982 {
19983 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19984 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19985 return 0;
19986 return 1;
19987 }
19988
19989 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19990
19991 static const char *
19992 aarch64_get_multilib_abi_name (void)
19993 {
19994 if (TARGET_BIG_END)
19995 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19996 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19997 }
19998
19999 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20000 global variable based guard, use the default; otherwise
20001 return a null tree. */
20002 static tree
20003 aarch64_stack_protect_guard (void)
20004 {
20005 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20006 return default_stack_protect_guard ();
20007
20008 return NULL_TREE;
20009 }
20010
20011 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20012 section at the end if needed. */
20013 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20014 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20015 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20016 void
20017 aarch64_file_end_indicate_exec_stack ()
20018 {
20019 file_end_indicate_exec_stack ();
20020
20021 unsigned feature_1_and = 0;
20022 if (aarch64_bti_enabled ())
20023 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20024
20025 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20026 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20027
20028 if (feature_1_and)
20029 {
20030 /* Generate .note.gnu.property section. */
20031 switch_to_section (get_section (".note.gnu.property",
20032 SECTION_NOTYPE, NULL));
20033
20034 /* PT_NOTE header: namesz, descsz, type.
20035 namesz = 4 ("GNU\0")
20036 descsz = 16 (Size of the program property array)
20037 [(12 + padding) * Number of array elements]
20038 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20039 assemble_align (POINTER_SIZE);
20040 assemble_integer (GEN_INT (4), 4, 32, 1);
20041 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20042 assemble_integer (GEN_INT (5), 4, 32, 1);
20043
20044 /* PT_NOTE name. */
20045 assemble_string ("GNU", 4);
20046
20047 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20048 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20049 datasz = 4
20050 data = feature_1_and. */
20051 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20052 assemble_integer (GEN_INT (4), 4, 32, 1);
20053 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20054
20055 /* Pad the size of the note to the required alignment. */
20056 assemble_align (POINTER_SIZE);
20057 }
20058 }
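/* Illustrative sketch of the emitted note (hypothetical LP64 build
   with both BTI and return-address signing enabled): feature_1_and
   is 3 and the .note.gnu.property section contains namesz 4,
   descsz ROUND_UP (12, 8) == 16, type 5, the string "GNU", then
   the GNU_PROPERTY_AARCH64_FEATURE_1_AND property with datasz 4
   and data 3, padded back to 8-byte alignment. */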
20059 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20060 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20061 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20062
20063 /* Target-specific selftests. */
20064
20065 #if CHECKING_P
20066
20067 namespace selftest {
20068
20069 /* Selftest for the RTL loader.
20070 Verify that the RTL loader copes with a dump from
20071 print_rtx_function. This is essentially just a test that class
20072 function_reader can handle a real dump, but it also verifies
20073 that lookup_reg_by_dump_name correctly handles hard regs.
20074 The presence of hard reg names in the dump means that the test is
20075 target-specific, hence it is in this file. */
20076
20077 static void
20078 aarch64_test_loading_full_dump ()
20079 {
20080 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20081
20082 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20083
20084 rtx_insn *insn_1 = get_insn_by_uid (1);
20085 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20086
20087 rtx_insn *insn_15 = get_insn_by_uid (15);
20088 ASSERT_EQ (INSN, GET_CODE (insn_15));
20089 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20090
20091 /* Verify crtl->return_rtx. */
20092 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20093 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20094 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20095 }
20096
20097 /* Run all target-specific selftests. */
20098
20099 static void
20100 aarch64_run_selftests (void)
20101 {
20102 aarch64_test_loading_full_dump ();
20103 }
20104
20105 } // namespace selftest
20106
20107 #endif /* #if CHECKING_P */
20108
20109 #undef TARGET_STACK_PROTECT_GUARD
20110 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20111
20112 #undef TARGET_ADDRESS_COST
20113 #define TARGET_ADDRESS_COST aarch64_address_cost
20114
20115 /* This hook determines whether unnamed bitfields affect the alignment
20116 of the containing structure. The hook returns true if the structure
20117 should inherit the alignment requirements of an unnamed bitfield's
20118 type. */
20119 #undef TARGET_ALIGN_ANON_BITFIELD
20120 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20121
20122 #undef TARGET_ASM_ALIGNED_DI_OP
20123 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20124
20125 #undef TARGET_ASM_ALIGNED_HI_OP
20126 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20127
20128 #undef TARGET_ASM_ALIGNED_SI_OP
20129 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20130
20131 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20132 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20133 hook_bool_const_tree_hwi_hwi_const_tree_true
20134
20135 #undef TARGET_ASM_FILE_START
20136 #define TARGET_ASM_FILE_START aarch64_start_file
20137
20138 #undef TARGET_ASM_OUTPUT_MI_THUNK
20139 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20140
20141 #undef TARGET_ASM_SELECT_RTX_SECTION
20142 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20143
20144 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20145 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20146
20147 #undef TARGET_BUILD_BUILTIN_VA_LIST
20148 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20149
20150 #undef TARGET_CALLEE_COPIES
20151 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20152
20153 #undef TARGET_CAN_ELIMINATE
20154 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20155
20156 #undef TARGET_CAN_INLINE_P
20157 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20158
20159 #undef TARGET_CANNOT_FORCE_CONST_MEM
20160 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20161
20162 #undef TARGET_CASE_VALUES_THRESHOLD
20163 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20164
20165 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20166 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20167
20168 /* Only the least significant bit is used for initialization guard
20169 variables. */
20170 #undef TARGET_CXX_GUARD_MASK_BIT
20171 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20172
20173 #undef TARGET_C_MODE_FOR_SUFFIX
20174 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20175
20176 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20177 #undef TARGET_DEFAULT_TARGET_FLAGS
20178 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20179 #endif
20180
20181 #undef TARGET_CLASS_MAX_NREGS
20182 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20183
20184 #undef TARGET_BUILTIN_DECL
20185 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20186
20187 #undef TARGET_BUILTIN_RECIPROCAL
20188 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20189
20190 #undef TARGET_C_EXCESS_PRECISION
20191 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20192
20193 #undef TARGET_EXPAND_BUILTIN
20194 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20195
20196 #undef TARGET_EXPAND_BUILTIN_VA_START
20197 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20198
20199 #undef TARGET_FOLD_BUILTIN
20200 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20201
20202 #undef TARGET_FUNCTION_ARG
20203 #define TARGET_FUNCTION_ARG aarch64_function_arg
20204
20205 #undef TARGET_FUNCTION_ARG_ADVANCE
20206 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20207
20208 #undef TARGET_FUNCTION_ARG_BOUNDARY
20209 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20210
20211 #undef TARGET_FUNCTION_ARG_PADDING
20212 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20213
20214 #undef TARGET_GET_RAW_RESULT_MODE
20215 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20216 #undef TARGET_GET_RAW_ARG_MODE
20217 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20218
20219 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20220 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20221
20222 #undef TARGET_FUNCTION_VALUE
20223 #define TARGET_FUNCTION_VALUE aarch64_function_value
20224
20225 #undef TARGET_FUNCTION_VALUE_REGNO_P
20226 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20227
20228 #undef TARGET_GIMPLE_FOLD_BUILTIN
20229 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20230
20231 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20232 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20233
20234 #undef TARGET_INIT_BUILTINS
20235 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20236
20237 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20238 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20239 aarch64_ira_change_pseudo_allocno_class
20240
20241 #undef TARGET_LEGITIMATE_ADDRESS_P
20242 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20243
20244 #undef TARGET_LEGITIMATE_CONSTANT_P
20245 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20246
20247 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20248 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20249 aarch64_legitimize_address_displacement
20250
20251 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20252 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20253
20254 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20255 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20256 aarch64_libgcc_floating_mode_supported_p
20257
20258 #undef TARGET_MANGLE_TYPE
20259 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20260
20261 #undef TARGET_MEMORY_MOVE_COST
20262 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20263
20264 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20265 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20266
20267 #undef TARGET_MUST_PASS_IN_STACK
20268 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20269
20270 /* This target hook should return true if accesses to volatile bitfields
20271 should use the narrowest mode possible. It should return false if these
20272 accesses should use the bitfield container type. */
20273 #undef TARGET_NARROW_VOLATILE_BITFIELD
20274 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20275
20276 #undef TARGET_OPTION_OVERRIDE
20277 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20278
20279 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20280 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20281 aarch64_override_options_after_change
20282
20283 #undef TARGET_OPTION_SAVE
20284 #define TARGET_OPTION_SAVE aarch64_option_save
20285
20286 #undef TARGET_OPTION_RESTORE
20287 #define TARGET_OPTION_RESTORE aarch64_option_restore
20288
20289 #undef TARGET_OPTION_PRINT
20290 #define TARGET_OPTION_PRINT aarch64_option_print
20291
20292 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20293 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20294
20295 #undef TARGET_SET_CURRENT_FUNCTION
20296 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20297
20298 #undef TARGET_PASS_BY_REFERENCE
20299 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20300
20301 #undef TARGET_PREFERRED_RELOAD_CLASS
20302 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20303
20304 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20305 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20306
20307 #undef TARGET_PROMOTED_TYPE
20308 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20309
20310 #undef TARGET_SECONDARY_RELOAD
20311 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20312
20313 #undef TARGET_SHIFT_TRUNCATION_MASK
20314 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20315
20316 #undef TARGET_SETUP_INCOMING_VARARGS
20317 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20318
20319 #undef TARGET_STRUCT_VALUE_RTX
20320 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20321
20322 #undef TARGET_REGISTER_MOVE_COST
20323 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20324
20325 #undef TARGET_RETURN_IN_MEMORY
20326 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20327
20328 #undef TARGET_RETURN_IN_MSB
20329 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20330
20331 #undef TARGET_RTX_COSTS
20332 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20333
20334 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20335 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20336
20337 #undef TARGET_SCHED_ISSUE_RATE
20338 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20339
20340 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20341 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20342 aarch64_sched_first_cycle_multipass_dfa_lookahead
20343
20344 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20345 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20346 aarch64_first_cycle_multipass_dfa_lookahead_guard
20347
20348 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20349 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20350 aarch64_get_separate_components
20351
20352 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20353 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20354 aarch64_components_for_bb
20355
20356 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20357 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20358 aarch64_disqualify_components
20359
20360 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20361 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20362 aarch64_emit_prologue_components
20363
20364 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20365 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20366 aarch64_emit_epilogue_components
20367
20368 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20369 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20370 aarch64_set_handled_components
20371
20372 #undef TARGET_TRAMPOLINE_INIT
20373 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20374
20375 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20376 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20377
20378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20379 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20380
20381 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20382 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20383 aarch64_builtin_support_vector_misalignment
20384
20385 #undef TARGET_ARRAY_MODE
20386 #define TARGET_ARRAY_MODE aarch64_array_mode
20387
20388 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20389 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20390
20391 #undef TARGET_VECTORIZE_ADD_STMT_COST
20392 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20393
20394 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20395 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20396 aarch64_builtin_vectorization_cost
20397
20398 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20399 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20400
20401 #undef TARGET_VECTORIZE_BUILTINS
20402 #define TARGET_VECTORIZE_BUILTINS
20403
20404 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20405 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20406 aarch64_builtin_vectorized_function
20407
20408 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20409 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20410 aarch64_autovectorize_vector_sizes
20411
20412 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20413 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20414 aarch64_atomic_assign_expand_fenv
20415
20416 /* Section anchor support. */
20417
20418 #undef TARGET_MIN_ANCHOR_OFFSET
20419 #define TARGET_MIN_ANCHOR_OFFSET -256
20420
20421 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20422 byte offset; we can do much more for larger data types, but have no way
20423 to determine the size of the access. We assume accesses are aligned. */
20424 #undef TARGET_MAX_ANCHOR_OFFSET
20425 #define TARGET_MAX_ANCHOR_OFFSET 4095
20426
20427 #undef TARGET_VECTOR_ALIGNMENT
20428 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20429
20430 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20431 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20432 aarch64_vectorize_preferred_vector_alignment
20433 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20434 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20435 aarch64_simd_vector_alignment_reachable
20436
20437 /* vec_perm support. */
20438
20439 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20440 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20441 aarch64_vectorize_vec_perm_const
20442
20443 #undef TARGET_VECTORIZE_GET_MASK_MODE
20444 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20445 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20446 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20447 aarch64_empty_mask_is_expensive
20448 #undef TARGET_PREFERRED_ELSE_VALUE
20449 #define TARGET_PREFERRED_ELSE_VALUE \
20450 aarch64_preferred_else_value
20451
20452 #undef TARGET_INIT_LIBFUNCS
20453 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20454
20455 #undef TARGET_FIXED_CONDITION_CODE_REGS
20456 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20457
20458 #undef TARGET_FLAGS_REGNUM
20459 #define TARGET_FLAGS_REGNUM CC_REGNUM
20460
20461 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20462 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20463
20464 #undef TARGET_ASAN_SHADOW_OFFSET
20465 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20466
20467 #undef TARGET_LEGITIMIZE_ADDRESS
20468 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20469
20470 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20471 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20472
20473 #undef TARGET_CAN_USE_DOLOOP_P
20474 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20475
20476 #undef TARGET_SCHED_ADJUST_PRIORITY
20477 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20478
20479 #undef TARGET_SCHED_MACRO_FUSION_P
20480 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20481
20482 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20483 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20484
20485 #undef TARGET_SCHED_FUSION_PRIORITY
20486 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20487
20488 #undef TARGET_UNSPEC_MAY_TRAP_P
20489 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20490
20491 #undef TARGET_USE_PSEUDO_PIC_REG
20492 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20493
20494 #undef TARGET_PRINT_OPERAND
20495 #define TARGET_PRINT_OPERAND aarch64_print_operand
20496
20497 #undef TARGET_PRINT_OPERAND_ADDRESS
20498 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20499
20500 #undef TARGET_OPTAB_SUPPORTED_P
20501 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20502
20503 #undef TARGET_OMIT_STRUCT_RETURN_REG
20504 #define TARGET_OMIT_STRUCT_RETURN_REG true
20505
20506 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20507 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20508 aarch64_dwarf_poly_indeterminate_value
20509
20510 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20511 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20512 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20513
20514 #undef TARGET_HARD_REGNO_NREGS
20515 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20516 #undef TARGET_HARD_REGNO_MODE_OK
20517 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20518
20519 #undef TARGET_MODES_TIEABLE_P
20520 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20521
20522 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20523 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20524 aarch64_hard_regno_call_part_clobbered
20525
20526 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20527 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20528 aarch64_remove_extra_call_preserved_regs
20529
20530 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20531 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20532 aarch64_return_call_with_max_clobbers
20533
20534 #undef TARGET_CONSTANT_ALIGNMENT
20535 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20536
20537 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20538 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20539 aarch64_stack_clash_protection_alloca_probe_range
20540
20541 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20542 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20543
20544 #undef TARGET_CAN_CHANGE_MODE_CLASS
20545 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20546
20547 #undef TARGET_SELECT_EARLY_REMAT_MODES
20548 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20549
20550 #undef TARGET_SPECULATION_SAFE_VALUE
20551 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20552
20553 #undef TARGET_ESTIMATED_POLY_VALUE
20554 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20555
20556 #undef TARGET_ATTRIBUTE_TABLE
20557 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20558
20559 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20560 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20561 aarch64_simd_clone_compute_vecsize_and_simdlen
20562
20563 #undef TARGET_SIMD_CLONE_ADJUST
20564 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20565
20566 #undef TARGET_SIMD_CLONE_USABLE
20567 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20568
20569 #undef TARGET_COMP_TYPE_ATTRIBUTES
20570 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20571
20572 #undef TARGET_GET_MULTILIB_ABI_NAME
20573 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20574
20575 #if CHECKING_P
20576 #undef TARGET_RUN_TARGET_SELFTESTS
20577 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20578 #endif /* #if CHECKING_P */
20579
20580 #undef TARGET_ASM_POST_CFI_STARTPROC
20581 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20582
20583 struct gcc_target targetm = TARGET_INITIALIZER;
20584
20585 #include "gt-aarch64.h"