1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
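/* As an illustrative note: e.g. simd_immediate_info (SImode, 1, MOV, LSL, 8)
   would describe a vector whose 32-bit elements all equal 1 << 8 and that can
   be materialized by a MOV-style immediate with an LSL #8 modifier.  */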
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
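/* The two tables below are built by X-macro expansion: each
   AARCH64_FUSION_PAIR entry in aarch64-fusion-pairs.def and each
   AARCH64_EXTRA_TUNING_OPTION entry in aarch64-tuning-flags.def expands to a
   { name, flag } pair, bracketed by explicit "none" and "all" entries and a
   null terminator.  */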
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387          their cost higher than memmov_cost (the actual costs are 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Costs for vector insn classes for Cortex-A57.  */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Costs for vector insn classes for X-Gene 1.  */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
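/* The condition codes above are laid out so that each code sits next to its
   inverse; flipping the low bit therefore inverts the condition, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */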
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251      * AARCH64_PARSE_OK: Handling was successful.
1252      * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254      * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
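/* For example, a -mbranch-protection string such as "pac-ret+leaf" is
   expected to be matched against the tables above: "pac-ret" in
   aarch64_branch_protect_types and "leaf" in its aarch64_pac_ret_subtypes
   list, with the corresponding handlers run in that order.  */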
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
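/* The caller is expected to pass the inverted, limited-range form of the
   branch in BRANCH_FORMAT.  We emit that branch to a fresh local label,
   followed by an unconditional "b" to the real destination and then the
   local label itself, so the short branch only has to skip over the "b".  */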
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1486 integer, otherwise return X unmodified. */
1487 static rtx
1488 aarch64_bit_representation (rtx x)
1489 {
1490 if (CONST_DOUBLE_P (x))
1491 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1492 return x;
1493 }
1494
1495 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1496 static bool
1497 aarch64_advsimd_struct_mode_p (machine_mode mode)
1498 {
1499 return (TARGET_SIMD
1500 && (mode == OImode || mode == CImode || mode == XImode));
1501 }
1502
1503 /* Return true if MODE is an SVE predicate mode. */
1504 static bool
1505 aarch64_sve_pred_mode_p (machine_mode mode)
1506 {
1507 return (TARGET_SVE
1508 && (mode == VNx16BImode
1509 || mode == VNx8BImode
1510 || mode == VNx4BImode
1511 || mode == VNx2BImode));
1512 }
1513
1514 /* Three mutually-exclusive flags describing a vector or predicate type. */
1515 const unsigned int VEC_ADVSIMD = 1;
1516 const unsigned int VEC_SVE_DATA = 2;
1517 const unsigned int VEC_SVE_PRED = 4;
1518 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1519 a structure of 2, 3 or 4 vectors. */
1520 const unsigned int VEC_STRUCT = 8;
1521 /* Useful combinations of the above. */
1522 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1523 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
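/* For example, aarch64_classify_vector_mode below returns VEC_ADVSIMD for
   V4SImode, VEC_SVE_DATA for VNx4SImode, and VEC_SVE_DATA | VEC_STRUCT for a
   tuple mode such as VNx8SImode (an x2 structure of SVE vectors), provided
   the corresponding target feature is enabled.  */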
1524
1525 /* Return a set of flags describing the vector properties of mode MODE.
1526 Ignore modes that are not supported by the current target. */
1527 static unsigned int
1528 aarch64_classify_vector_mode (machine_mode mode)
1529 {
1530 if (aarch64_advsimd_struct_mode_p (mode))
1531 return VEC_ADVSIMD | VEC_STRUCT;
1532
1533 if (aarch64_sve_pred_mode_p (mode))
1534 return VEC_SVE_PRED;
1535
1536 /* Make the decision based on the mode's enum value rather than its
1537 properties, so that we keep the correct classification regardless
1538 of -msve-vector-bits. */
1539 switch (mode)
1540 {
1541 /* Single SVE vectors. */
1542 case E_VNx16QImode:
1543 case E_VNx8HImode:
1544 case E_VNx4SImode:
1545 case E_VNx2DImode:
1546 case E_VNx8HFmode:
1547 case E_VNx4SFmode:
1548 case E_VNx2DFmode:
1549 return TARGET_SVE ? VEC_SVE_DATA : 0;
1550
1551 /* x2 SVE vectors. */
1552 case E_VNx32QImode:
1553 case E_VNx16HImode:
1554 case E_VNx8SImode:
1555 case E_VNx4DImode:
1556 case E_VNx16HFmode:
1557 case E_VNx8SFmode:
1558 case E_VNx4DFmode:
1559 /* x3 SVE vectors. */
1560 case E_VNx48QImode:
1561 case E_VNx24HImode:
1562 case E_VNx12SImode:
1563 case E_VNx6DImode:
1564 case E_VNx24HFmode:
1565 case E_VNx12SFmode:
1566 case E_VNx6DFmode:
1567 /* x4 SVE vectors. */
1568 case E_VNx64QImode:
1569 case E_VNx32HImode:
1570 case E_VNx16SImode:
1571 case E_VNx8DImode:
1572 case E_VNx32HFmode:
1573 case E_VNx16SFmode:
1574 case E_VNx8DFmode:
1575 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1576
1577 /* 64-bit Advanced SIMD vectors. */
1578 case E_V8QImode:
1579 case E_V4HImode:
1580 case E_V2SImode:
1581 /* ...E_V1DImode doesn't exist. */
1582 case E_V4HFmode:
1583 case E_V2SFmode:
1584 case E_V1DFmode:
1585 /* 128-bit Advanced SIMD vectors. */
1586 case E_V16QImode:
1587 case E_V8HImode:
1588 case E_V4SImode:
1589 case E_V2DImode:
1590 case E_V8HFmode:
1591 case E_V4SFmode:
1592 case E_V2DFmode:
1593 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1594
1595 default:
1596 return 0;
1597 }
1598 }
1599
1600 /* Return true if MODE is any of the data vector modes, including
1601 structure modes. */
1602 static bool
1603 aarch64_vector_data_mode_p (machine_mode mode)
1604 {
1605 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1606 }
1607
1608 /* Return true if MODE is an SVE data vector mode; either a single vector
1609 or a structure of vectors. */
1610 static bool
1611 aarch64_sve_data_mode_p (machine_mode mode)
1612 {
1613 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1614 }
1615
1616 /* Implement target hook TARGET_ARRAY_MODE. */
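/* For SVE, an array of 2, 3 or 4 data vectors maps onto the corresponding
   tuple mode, e.g. an array of three VNx4SImode vectors becomes
   VNx12SImode.  */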
1617 static opt_machine_mode
1618 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1619 {
1620 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1621 && IN_RANGE (nelems, 2, 4))
1622 return mode_for_vector (GET_MODE_INNER (mode),
1623 GET_MODE_NUNITS (mode) * nelems);
1624
1625 return opt_machine_mode ();
1626 }
1627
1628 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1629 static bool
1630 aarch64_array_mode_supported_p (machine_mode mode,
1631 unsigned HOST_WIDE_INT nelems)
1632 {
1633 if (TARGET_SIMD
1634 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1635 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1636 && (nelems >= 2 && nelems <= 4))
1637 return true;
1638
1639 return false;
1640 }
1641
1642 /* Return the SVE predicate mode to use for elements that have
1643 ELEM_NBYTES bytes, if such a mode exists. */
1644
1645 opt_machine_mode
1646 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1647 {
1648 if (TARGET_SVE)
1649 {
1650 if (elem_nbytes == 1)
1651 return VNx16BImode;
1652 if (elem_nbytes == 2)
1653 return VNx8BImode;
1654 if (elem_nbytes == 4)
1655 return VNx4BImode;
1656 if (elem_nbytes == 8)
1657 return VNx2BImode;
1658 }
1659 return opt_machine_mode ();
1660 }
1661
1662 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1663
1664 static opt_machine_mode
1665 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1666 {
1667 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1668 {
1669 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1670 machine_mode pred_mode;
1671 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1672 return pred_mode;
1673 }
1674
1675 return default_get_mask_mode (nunits, nbytes);
1676 }
1677
1678 /* Return the integer element mode associated with SVE mode MODE. */
1679
1680 static scalar_int_mode
1681 aarch64_sve_element_int_mode (machine_mode mode)
1682 {
1683 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1684 GET_MODE_NUNITS (mode));
1685 return int_mode_for_size (elt_bits, 0).require ();
1686 }
1687
1688 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1689 prefer to use the first arithmetic operand as the else value if
1690 the else value doesn't matter, since that exactly matches the SVE
1691 destructive merging form. For ternary operations we could either
1692 pick the first operand and use FMAD-like instructions or the last
1693 operand and use FMLA-like instructions; the latter seems more
1694 natural. */
1695
1696 static tree
1697 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1698 {
1699 return nops == 3 ? ops[2] : ops[0];
1700 }
1701
1702 /* Implement TARGET_HARD_REGNO_NREGS. */
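/* For example, a TImode value needs CEIL (16, UNITS_PER_WORD) == 2 general
   registers, while an SVE tuple mode such as VNx8SImode occupies
   GET_MODE_SIZE / BYTES_PER_SVE_VECTOR == 2 FP/vector registers.  */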
1703
1704 static unsigned int
1705 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1706 {
1707 /* ??? Logically we should only need to provide a value when
1708 HARD_REGNO_MODE_OK says that the combination is valid,
1709 but at the moment we need to handle all modes. Just ignore
1710 any runtime parts for registers that can't store them. */
1711 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1712 switch (aarch64_regno_regclass (regno))
1713 {
1714 case FP_REGS:
1715 case FP_LO_REGS:
1716 case FP_LO8_REGS:
1717 if (aarch64_sve_data_mode_p (mode))
1718 return exact_div (GET_MODE_SIZE (mode),
1719 BYTES_PER_SVE_VECTOR).to_constant ();
1720 return CEIL (lowest_size, UNITS_PER_VREG);
1721 case PR_REGS:
1722 case PR_LO_REGS:
1723 case PR_HI_REGS:
1724 return 1;
1725 default:
1726 return CEIL (lowest_size, UNITS_PER_WORD);
1727 }
1728 gcc_unreachable ();
1729 }
1730
1731 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1732
1733 static bool
1734 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1735 {
1736 if (GET_MODE_CLASS (mode) == MODE_CC)
1737 return regno == CC_REGNUM;
1738
1739 if (regno == VG_REGNUM)
1740 /* This must have the same size as _Unwind_Word. */
1741 return mode == DImode;
1742
1743 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1744 if (vec_flags & VEC_SVE_PRED)
1745 return PR_REGNUM_P (regno);
1746
1747 if (PR_REGNUM_P (regno))
1748 return 0;
1749
1750 if (regno == SP_REGNUM)
1751 /* The purpose of comparing with ptr_mode is to support the
1752 global register variable associated with the stack pointer
1753 register via the syntax of asm ("wsp") in ILP32. */
1754 return mode == Pmode || mode == ptr_mode;
1755
1756 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1757 return mode == Pmode;
1758
1759 if (GP_REGNUM_P (regno))
1760 {
1761 if (known_le (GET_MODE_SIZE (mode), 8))
1762 return true;
1763 else if (known_le (GET_MODE_SIZE (mode), 16))
1764 return (regno & 1) == 0;
1765 }
1766 else if (FP_REGNUM_P (regno))
1767 {
1768 if (vec_flags & VEC_STRUCT)
1769 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1770 else
1771 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1772 }
1773
1774 return false;
1775 }
1776
1777 /* Return true if this is a definition of a vectorized simd function. */
1778
1779 static bool
1780 aarch64_simd_decl_p (tree fndecl)
1781 {
1782 tree fntype;
1783
1784 if (fndecl == NULL)
1785 return false;
1786 fntype = TREE_TYPE (fndecl);
1787 if (fntype == NULL)
1788 return false;
1789
1790 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1791 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1792 return true;
1793
1794 return false;
1795 }
1796
1797 /* Return the mode a register save/restore should use. DImode for integer
1798 registers, DFmode for FP registers in non-SIMD functions (they only save
1799 the bottom half of a 128 bit register), or TFmode for FP registers in
1800 SIMD functions. */
1801
1802 static machine_mode
1803 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1804 {
1805 return GP_REGNUM_P (regno)
1806 ? E_DImode
1807 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1808 }
1809
1810 /* Return true if the instruction is a call to a SIMD function, false
1811 if it is not a SIMD function or if we do not know anything about
1812 the function. */
1813
1814 static bool
1815 aarch64_simd_call_p (rtx_insn *insn)
1816 {
1817 rtx symbol;
1818 rtx call;
1819 tree fndecl;
1820
1821 gcc_assert (CALL_P (insn));
1822 call = get_call_rtx_from (insn);
1823 symbol = XEXP (XEXP (call, 0), 0);
1824 if (GET_CODE (symbol) != SYMBOL_REF)
1825 return false;
1826 fndecl = SYMBOL_REF_DECL (symbol);
1827 if (!fndecl)
1828 return false;
1829
1830 return aarch64_simd_decl_p (fndecl);
1831 }
1832
1833 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1834 a function that uses the SIMD ABI, take advantage of the extra
1835 call-preserved registers that the ABI provides. */
1836
1837 void
1838 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1839 HARD_REG_SET *return_set)
1840 {
1841 if (aarch64_simd_call_p (insn))
1842 {
1843 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1844 if (FP_SIMD_SAVED_REGNUM_P (regno))
1845 CLEAR_HARD_REG_BIT (*return_set, regno);
1846 }
1847 }
1848
1849 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1850 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1851 clobbers the top 64 bits when restoring the bottom 64 bits. */
1852
1853 static bool
1854 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1855 machine_mode mode)
1856 {
1857 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1858 return FP_REGNUM_P (regno)
1859 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1860 }
1861
1862 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1863
1864 rtx_insn *
1865 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1866 {
1867 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1868
1869 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1870 return call_1;
1871 else
1872 return call_2;
1873 }
1874
1875 /* Implement REGMODE_NATURAL_SIZE. */
1876 poly_uint64
1877 aarch64_regmode_natural_size (machine_mode mode)
1878 {
1879 /* The natural size for SVE data modes is one SVE data vector,
1880 and similarly for predicates. We can't independently modify
1881 anything smaller than that. */
1882 /* ??? For now, only do this for variable-width SVE registers.
1883 Doing it for constant-sized registers breaks lower-subreg.c. */
1884 /* ??? And once that's fixed, we should probably have similar
1885 code for Advanced SIMD. */
1886 if (!aarch64_sve_vg.is_constant ())
1887 {
1888 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1889 if (vec_flags & VEC_SVE_PRED)
1890 return BYTES_PER_SVE_PRED;
1891 if (vec_flags & VEC_SVE_DATA)
1892 return BYTES_PER_SVE_VECTOR;
1893 }
1894 return UNITS_PER_WORD;
1895 }
1896
1897 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1898 machine_mode
1899 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1900 machine_mode mode)
1901 {
1902 /* The predicate mode determines which bits are significant and
1903 which are "don't care". Decreasing the number of lanes would
1904 lose data while increasing the number of lanes would make bits
1905 unnecessarily significant. */
1906 if (PR_REGNUM_P (regno))
1907 return mode;
1908 if (known_ge (GET_MODE_SIZE (mode), 4))
1909 return mode;
1910 else
1911 return SImode;
1912 }
1913
1914 /* Return true if I's bits are consecutive ones from the MSB. */
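/* For instance, 0xffffffffffff0000 is -0x10000, and 0x10000 is a power of
   two, so the function returns true; 0x00000000ff000000 does not have its
   ones anchored at the MSB and fails the test. */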
1915 bool
1916 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1917 {
1918 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1919 }
1920
1921 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1922 that strcpy from constants will be faster. */
1923
1924 static HOST_WIDE_INT
1925 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1926 {
1927 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1928 return MAX (align, BITS_PER_WORD);
1929 return align;
1930 }
1931
1932 /* Return true if calls to DECL should be treated as
1933 long-calls (i.e. called via a register). */
1934 static bool
1935 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1936 {
1937 return false;
1938 }
1939
1940 /* Return true if calls to symbol-ref SYM should be treated as
1941 long-calls (i.e. called via a register). */
1942 bool
1943 aarch64_is_long_call_p (rtx sym)
1944 {
1945 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1946 }
1947
1948 /* Return true if calls to symbol-ref SYM should not go through
1949 plt stubs. */
1950
1951 bool
1952 aarch64_is_noplt_call_p (rtx sym)
1953 {
1954 const_tree decl = SYMBOL_REF_DECL (sym);
1955
1956 if (flag_pic
1957 && decl
1958 && (!flag_plt
1959 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1960 && !targetm.binds_local_p (decl))
1961 return true;
1962
1963 return false;
1964 }
1965
1966 /* Return true if the offsets to a zero/sign-extract operation
1967 represent an expression that matches an extend operation. The
1968 operands represent the parameters from
1969
1970 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1971 bool
1972 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1973 rtx extract_imm)
1974 {
1975 HOST_WIDE_INT mult_val, extract_val;
1976
1977 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1978 return false;
1979
1980 mult_val = INTVAL (mult_imm);
1981 extract_val = INTVAL (extract_imm);
1982
1983 if (extract_val > 8
1984 && extract_val < GET_MODE_BITSIZE (mode)
1985 && exact_log2 (extract_val & ~7) > 0
1986 && (extract_val & 7) <= 4
1987 && mult_val == (1 << (extract_val & 7)))
1988 return true;
1989
1990 return false;
1991 }
1992
1993 /* Emit an insn that's a simple single-set. Both the operands must be
1994 known to be valid. */
1995 inline static rtx_insn *
1996 emit_set_insn (rtx x, rtx y)
1997 {
1998 return emit_insn (gen_rtx_SET (x, y));
1999 }
2000
2001 /* X and Y are two things to compare using CODE. Emit the compare insn and
2002 return the rtx for the CC register in the appropriate mode. */
2003 rtx
2004 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2005 {
2006 machine_mode mode = SELECT_CC_MODE (code, x, y);
2007 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2008
2009 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2010 return cc_reg;
2011 }
2012
2013 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2014
2015 static rtx
2016 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2017 machine_mode y_mode)
2018 {
2019 if (y_mode == E_QImode || y_mode == E_HImode)
2020 {
2021 if (CONST_INT_P (y))
2022 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2023 else
2024 {
2025 rtx t, cc_reg;
2026 machine_mode cc_mode;
2027
2028 t = gen_rtx_ZERO_EXTEND (SImode, y);
2029 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2030 cc_mode = CC_SWPmode;
2031 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2032 emit_set_insn (cc_reg, t);
2033 return cc_reg;
2034 }
2035 }
2036
2037 return aarch64_gen_compare_reg (code, x, y);
2038 }
2039
2040 /* Build the SYMBOL_REF for __tls_get_addr. */
2041
2042 static GTY(()) rtx tls_get_addr_libfunc;
2043
2044 rtx
2045 aarch64_tls_get_addr (void)
2046 {
2047 if (!tls_get_addr_libfunc)
2048 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2049 return tls_get_addr_libfunc;
2050 }
2051
2052 /* Return the TLS model to use for ADDR. */
2053
2054 static enum tls_model
2055 tls_symbolic_operand_type (rtx addr)
2056 {
2057 enum tls_model tls_kind = TLS_MODEL_NONE;
2058 if (GET_CODE (addr) == CONST)
2059 {
2060 poly_int64 addend;
2061 rtx sym = strip_offset (addr, &addend);
2062 if (GET_CODE (sym) == SYMBOL_REF)
2063 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2064 }
2065 else if (GET_CODE (addr) == SYMBOL_REF)
2066 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2067
2068 return tls_kind;
2069 }
2070
2071 /* We allow lo_sums in our legitimate addresses so that combine
2072 can take care of combining addresses where necessary, but for
2073 generation purposes we generate the address
2074 as:
2075 RTL Absolute
2076 tmp = hi (symbol_ref); adrp x1, foo
2077 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2078 nop
2079
2080 PIC TLS
2081 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2082 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2083 bl __tls_get_addr
2084 nop
2085
2086 Load TLS symbol, depending on TLS mechanism and TLS access model.
2087
2088 Global Dynamic - Traditional TLS:
2089 adrp tmp, :tlsgd:imm
2090 add dest, tmp, #:tlsgd_lo12:imm
2091 bl __tls_get_addr
2092
2093 Global Dynamic - TLS Descriptors:
2094 adrp dest, :tlsdesc:imm
2095 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2096 add dest, dest, #:tlsdesc_lo12:imm
2097 blr tmp
2098 mrs tp, tpidr_el0
2099 add dest, dest, tp
2100
2101 Initial Exec:
2102 mrs tp, tpidr_el0
2103 adrp tmp, :gottprel:imm
2104 ldr dest, [tmp, #:gottprel_lo12:imm]
2105 add dest, dest, tp
2106
2107 Local Exec:
2108 mrs tp, tpidr_el0
2109 add t0, tp, #:tprel_hi12:imm, lsl #12
2110 add t0, t0, #:tprel_lo12_nc:imm
2111 */
2112
2113 static void
2114 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2115 enum aarch64_symbol_type type)
2116 {
2117 switch (type)
2118 {
2119 case SYMBOL_SMALL_ABSOLUTE:
2120 {
2121 /* In ILP32, the mode of dest can be either SImode or DImode. */
2122 rtx tmp_reg = dest;
2123 machine_mode mode = GET_MODE (dest);
2124
2125 gcc_assert (mode == Pmode || mode == ptr_mode);
2126
2127 if (can_create_pseudo_p ())
2128 tmp_reg = gen_reg_rtx (mode);
2129
2130 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2131 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2132 return;
2133 }
2134
2135 case SYMBOL_TINY_ABSOLUTE:
2136 emit_insn (gen_rtx_SET (dest, imm));
2137 return;
2138
2139 case SYMBOL_SMALL_GOT_28K:
2140 {
2141 machine_mode mode = GET_MODE (dest);
2142 rtx gp_rtx = pic_offset_table_rtx;
2143 rtx insn;
2144 rtx mem;
2145
2146 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2147 here before RTL expansion. Tree IVOPTS will generate RTL patterns
2148 to decide rtx costs, in which case pic_offset_table_rtx is not
2149 initialized. In that case there is no need to generate the first
2150 adrp instruction, as the final cost for a global variable access
2151 is one instruction. */
2152 if (gp_rtx != NULL)
2153 {
2154 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2155 use the page base as the GOT base, the first page may be wasted;
2156 in the worst case there is only 28K of space for the GOT).
2157
2158 The generated instruction sequence for accessing a global variable
2159 is:
2160
2161 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2162
2163 Only one instruction is needed. But we must initialize
2164 pic_offset_table_rtx properly. We generate an initialization insn
2165 for every global access, and let CSE remove all redundant copies.
2166
2167 The final instruction sequence will look like the following
2168 for accesses to multiple global variables.
2169
2170 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2171
2172 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2173 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2174 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2175 ... */
2176
2177 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2178 crtl->uses_pic_offset_table = 1;
2179 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2180
2181 if (mode != GET_MODE (gp_rtx))
2182 gp_rtx = gen_lowpart (mode, gp_rtx);
2183
2184 }
2185
2186 if (mode == ptr_mode)
2187 {
2188 if (mode == DImode)
2189 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2190 else
2191 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2192
2193 mem = XVECEXP (SET_SRC (insn), 0, 0);
2194 }
2195 else
2196 {
2197 gcc_assert (mode == Pmode);
2198
2199 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2200 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2201 }
2202
2203 /* The operand is expected to be a MEM. Whenever the related insn
2204 pattern changes, the code above that computes MEM should be
2205 updated. */
2206 gcc_assert (GET_CODE (mem) == MEM);
2207 MEM_READONLY_P (mem) = 1;
2208 MEM_NOTRAP_P (mem) = 1;
2209 emit_insn (insn);
2210 return;
2211 }
2212
2213 case SYMBOL_SMALL_GOT_4G:
2214 {
2215 /* In ILP32, the mode of dest can be either SImode or DImode,
2216 while the got entry is always of SImode size. The mode of
2217 dest depends on how dest is used: if dest is assigned to a
2218 pointer (e.g. in the memory), it has SImode; it may have
2219 DImode if dest is dereferenced to access the memory.
2220 This is why we have to handle three different ldr_got_small
2221 patterns here (two patterns for ILP32). */
2222
2223 rtx insn;
2224 rtx mem;
2225 rtx tmp_reg = dest;
2226 machine_mode mode = GET_MODE (dest);
2227
2228 if (can_create_pseudo_p ())
2229 tmp_reg = gen_reg_rtx (mode);
2230
2231 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2232 if (mode == ptr_mode)
2233 {
2234 if (mode == DImode)
2235 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2236 else
2237 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2238
2239 mem = XVECEXP (SET_SRC (insn), 0, 0);
2240 }
2241 else
2242 {
2243 gcc_assert (mode == Pmode);
2244
2245 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2246 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2247 }
2248
2249 gcc_assert (GET_CODE (mem) == MEM);
2250 MEM_READONLY_P (mem) = 1;
2251 MEM_NOTRAP_P (mem) = 1;
2252 emit_insn (insn);
2253 return;
2254 }
2255
2256 case SYMBOL_SMALL_TLSGD:
2257 {
2258 rtx_insn *insns;
2259 machine_mode mode = GET_MODE (dest);
2260 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2261
2262 start_sequence ();
2263 if (TARGET_ILP32)
2264 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2265 else
2266 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2267 insns = get_insns ();
2268 end_sequence ();
2269
2270 RTL_CONST_CALL_P (insns) = 1;
2271 emit_libcall_block (insns, dest, result, imm);
2272 return;
2273 }
2274
2275 case SYMBOL_SMALL_TLSDESC:
2276 {
2277 machine_mode mode = GET_MODE (dest);
2278 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2279 rtx tp;
2280
2281 gcc_assert (mode == Pmode || mode == ptr_mode);
2282
2283 /* In ILP32, the got entry is always of SImode size. Unlike
2284 small GOT, the dest is fixed at reg 0. */
2285 if (TARGET_ILP32)
2286 emit_insn (gen_tlsdesc_small_si (imm));
2287 else
2288 emit_insn (gen_tlsdesc_small_di (imm));
2289 tp = aarch64_load_tp (NULL);
2290
2291 if (mode != Pmode)
2292 tp = gen_lowpart (mode, tp);
2293
2294 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2295 if (REG_P (dest))
2296 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2297 return;
2298 }
2299
2300 case SYMBOL_SMALL_TLSIE:
2301 {
2302 /* In ILP32, the mode of dest can be either SImode or DImode,
2303 while the got entry is always of SImode size. The mode of
2304 dest depends on how dest is used: if dest is assigned to a
2305 pointer (e.g. in the memory), it has SImode; it may have
2306 DImode if dest is dereferenced to access the memory.
2307 This is why we have to handle three different tlsie_small
2308 patterns here (two patterns for ILP32). */
2309 machine_mode mode = GET_MODE (dest);
2310 rtx tmp_reg = gen_reg_rtx (mode);
2311 rtx tp = aarch64_load_tp (NULL);
2312
2313 if (mode == ptr_mode)
2314 {
2315 if (mode == DImode)
2316 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2317 else
2318 {
2319 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2320 tp = gen_lowpart (mode, tp);
2321 }
2322 }
2323 else
2324 {
2325 gcc_assert (mode == Pmode);
2326 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2327 }
2328
2329 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2330 if (REG_P (dest))
2331 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2332 return;
2333 }
2334
2335 case SYMBOL_TLSLE12:
2336 case SYMBOL_TLSLE24:
2337 case SYMBOL_TLSLE32:
2338 case SYMBOL_TLSLE48:
2339 {
2340 machine_mode mode = GET_MODE (dest);
2341 rtx tp = aarch64_load_tp (NULL);
2342
2343 if (mode != Pmode)
2344 tp = gen_lowpart (mode, tp);
2345
2346 switch (type)
2347 {
2348 case SYMBOL_TLSLE12:
2349 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2350 (dest, tp, imm));
2351 break;
2352 case SYMBOL_TLSLE24:
2353 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2354 (dest, tp, imm));
2355 break;
2356 case SYMBOL_TLSLE32:
2357 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2358 (dest, imm));
2359 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2360 (dest, dest, tp));
2361 break;
2362 case SYMBOL_TLSLE48:
2363 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2364 (dest, imm));
2365 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2366 (dest, dest, tp));
2367 break;
2368 default:
2369 gcc_unreachable ();
2370 }
2371
2372 if (REG_P (dest))
2373 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2374 return;
2375 }
2376
2377 case SYMBOL_TINY_GOT:
2378 emit_insn (gen_ldr_got_tiny (dest, imm));
2379 return;
2380
2381 case SYMBOL_TINY_TLSIE:
2382 {
2383 machine_mode mode = GET_MODE (dest);
2384 rtx tp = aarch64_load_tp (NULL);
2385
2386 if (mode == ptr_mode)
2387 {
2388 if (mode == DImode)
2389 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2390 else
2391 {
2392 tp = gen_lowpart (mode, tp);
2393 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2394 }
2395 }
2396 else
2397 {
2398 gcc_assert (mode == Pmode);
2399 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2400 }
2401
2402 if (REG_P (dest))
2403 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2404 return;
2405 }
2406
2407 default:
2408 gcc_unreachable ();
2409 }
2410 }
2411
2412 /* Emit a move from SRC to DEST. Assume that the move expanders can
2413 handle all moves if !can_create_pseudo_p (). The distinction is
2414 important because, unlike emit_move_insn, the move expanders know
2415 how to force Pmode objects into the constant pool even when the
2416 constant pool address is not itself legitimate. */
2417 static rtx
2418 aarch64_emit_move (rtx dest, rtx src)
2419 {
2420 return (can_create_pseudo_p ()
2421 ? emit_move_insn (dest, src)
2422 : emit_move_insn_1 (dest, src));
2423 }
2424
2425 /* Apply UNOPTAB to OP and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2429 {
2430 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2431 if (dest != tmp)
2432 emit_move_insn (dest, tmp);
2433 }
2434
2435 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2436
2437 static void
2438 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2439 {
2440 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2441 OPTAB_DIRECT);
2442 if (dest != tmp)
2443 emit_move_insn (dest, tmp);
2444 }
2445
2446 /* Split a 128-bit move operation into two 64-bit move operations,
2447 taking care to handle partial overlap of register to register
2448 copies. Special cases are needed when moving between GP regs and
2449 FP regs. SRC can be a register, constant or memory; DST a register
2450 or memory. If either operand is memory it must not have any side
2451 effects. */
2452 void
2453 aarch64_split_128bit_move (rtx dst, rtx src)
2454 {
2455 rtx dst_lo, dst_hi;
2456 rtx src_lo, src_hi;
2457
2458 machine_mode mode = GET_MODE (dst);
2459
2460 gcc_assert (mode == TImode || mode == TFmode);
2461 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2462 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2463
2464 if (REG_P (dst) && REG_P (src))
2465 {
2466 int src_regno = REGNO (src);
2467 int dst_regno = REGNO (dst);
2468
2469 /* Handle FP <-> GP regs. */
2470 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2471 {
2472 src_lo = gen_lowpart (word_mode, src);
2473 src_hi = gen_highpart (word_mode, src);
2474
2475 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2476 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2477 return;
2478 }
2479 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2480 {
2481 dst_lo = gen_lowpart (word_mode, dst);
2482 dst_hi = gen_highpart (word_mode, dst);
2483
2484 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2485 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2486 return;
2487 }
2488 }
2489
2490 dst_lo = gen_lowpart (word_mode, dst);
2491 dst_hi = gen_highpart (word_mode, dst);
2492 src_lo = gen_lowpart (word_mode, src);
2493 src_hi = gen_highpart_mode (word_mode, mode, src);
2494
2495 /* At most one pairing may overlap. */
2496 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2497 {
2498 aarch64_emit_move (dst_hi, src_hi);
2499 aarch64_emit_move (dst_lo, src_lo);
2500 }
2501 else
2502 {
2503 aarch64_emit_move (dst_lo, src_lo);
2504 aarch64_emit_move (dst_hi, src_hi);
2505 }
2506 }
2507
2508 bool
2509 aarch64_split_128bit_move_p (rtx dst, rtx src)
2510 {
2511 return (! REG_P (src)
2512 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2513 }
2514
2515 /* Split a complex SIMD combine. */
2516
2517 void
2518 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2519 {
2520 machine_mode src_mode = GET_MODE (src1);
2521 machine_mode dst_mode = GET_MODE (dst);
2522
2523 gcc_assert (VECTOR_MODE_P (dst_mode));
2524 gcc_assert (register_operand (dst, dst_mode)
2525 && register_operand (src1, src_mode)
2526 && register_operand (src2, src_mode));
2527
2528 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2529 return;
2530 }
2531
2532 /* Split a complex SIMD move. */
2533
2534 void
2535 aarch64_split_simd_move (rtx dst, rtx src)
2536 {
2537 machine_mode src_mode = GET_MODE (src);
2538 machine_mode dst_mode = GET_MODE (dst);
2539
2540 gcc_assert (VECTOR_MODE_P (dst_mode));
2541
2542 if (REG_P (dst) && REG_P (src))
2543 {
2544 gcc_assert (VECTOR_MODE_P (src_mode));
2545 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2546 }
2547 }
2548
2549 bool
2550 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2551 machine_mode ymode, rtx y)
2552 {
2553 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2554 gcc_assert (r != NULL);
2555 return rtx_equal_p (x, r);
2556 }
2557
2558
2559 /* Return TARGET if it is nonnull and a register of mode MODE.
2560 Otherwise, return a fresh register of mode MODE if we can,
2561 or TARGET reinterpreted as MODE if we can't. */
2562
2563 static rtx
2564 aarch64_target_reg (rtx target, machine_mode mode)
2565 {
2566 if (target && REG_P (target) && GET_MODE (target) == mode)
2567 return target;
2568 if (!can_create_pseudo_p ())
2569 {
2570 gcc_assert (target);
2571 return gen_lowpart (mode, target);
2572 }
2573 return gen_reg_rtx (mode);
2574 }
2575
2576 /* Return a register that contains the constant in BUILDER, given that
2577 the constant is a legitimate move operand. Use TARGET as the register
2578 if it is nonnull and convenient. */
2579
2580 static rtx
2581 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2582 {
2583 rtx src = builder.build ();
2584 target = aarch64_target_reg (target, GET_MODE (src));
2585 emit_insn (gen_rtx_SET (target, src));
2586 return target;
2587 }
2588
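/* Return a register of mode MODE that contains VALUE. Create a new
   pseudo if we can; otherwise emit a move of VALUE into X (which must
   then be nonnull) and return X. */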
2589 static rtx
2590 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2591 {
2592 if (can_create_pseudo_p ())
2593 return force_reg (mode, value);
2594 else
2595 {
2596 gcc_assert (x);
2597 aarch64_emit_move (x, value);
2598 return x;
2599 }
2600 }
2601
2602 /* Return true if predicate value X is a constant in which every element
2603 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2604 value, i.e. as a predicate in which all bits are significant. */
2605
2606 static bool
2607 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2608 {
2609 if (GET_CODE (x) != CONST_VECTOR)
2610 return false;
2611
2612 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2613 GET_MODE_NUNITS (GET_MODE (x)));
2614 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2615 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2616 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2617
2618 unsigned int nelts = const_vector_encoded_nelts (x);
2619 for (unsigned int i = 0; i < nelts; ++i)
2620 {
2621 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2622 if (!CONST_INT_P (elt))
2623 return false;
2624
2625 builder.quick_push (elt);
2626 for (unsigned int j = 1; j < factor; ++j)
2627 builder.quick_push (const0_rtx);
2628 }
2629 builder.finalize ();
2630 return true;
2631 }
2632
2633 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2634 widest predicate element size it can have (that is, the largest size
2635 for which each element would still be 0 or 1). */
2636
2637 unsigned int
2638 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2639 {
2640 /* Start with the most optimistic assumption: that we only need
2641 one bit per pattern. This is what we will use if only the first
2642 bit in each pattern is ever set. */
2643 unsigned int mask = GET_MODE_SIZE (DImode);
2644 mask |= builder.npatterns ();
2645
2646 /* Look for set bits. */
2647 unsigned int nelts = builder.encoded_nelts ();
2648 for (unsigned int i = 1; i < nelts; ++i)
2649 if (INTVAL (builder.elt (i)) != 0)
2650 {
2651 if (i & 1)
2652 return 1;
2653 mask |= i;
2654 }
2655 return mask & -mask;
2656 }
2657
2658 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2659 that the constant would have with predicate element size ELT_SIZE
2660 (ignoring the upper bits in each element) and return:
2661
2662 * -1 if all bits are set
2663 * N if the predicate has N leading set bits followed by all clear bits
2664 * 0 if the predicate does not have any of these forms. */
2665
2666 int
2667 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2668 unsigned int elt_size)
2669 {
2670 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2671 followed by set bits. */
2672 if (builder.nelts_per_pattern () == 3)
2673 return 0;
2674
2675 /* Skip over leading set bits. */
2676 unsigned int nelts = builder.encoded_nelts ();
2677 unsigned int i = 0;
2678 for (; i < nelts; i += elt_size)
2679 if (INTVAL (builder.elt (i)) == 0)
2680 break;
2681 unsigned int vl = i / elt_size;
2682
2683 /* Check for the all-true case. */
2684 if (i == nelts)
2685 return -1;
2686
2687 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2688 repeating pattern of set bits followed by clear bits. */
2689 if (builder.nelts_per_pattern () != 2)
2690 return 0;
2691
2692 /* We have a "foreground" value and a duplicated "background" value.
2693 If the background might repeat and the last set bit belongs to it,
2694 we might have set bits followed by clear bits followed by set bits. */
2695 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2696 return 0;
2697
2698 /* Make sure that the rest are all clear. */
2699 for (; i < nelts; i += elt_size)
2700 if (INTVAL (builder.elt (i)) != 0)
2701 return 0;
2702
2703 return vl;
2704 }
2705
2706 /* See if there is an svpattern that encodes an SVE predicate of mode
2707 PRED_MODE in which the first VL bits are set and the rest are clear.
2708 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2709 A VL of -1 indicates an all-true vector. */
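/* As an illustration (assuming the vector length is known to be 256 bits,
   so that a VNx16BI predicate has 32 elements): a VL of 7 maps to
   AARCH64_SV_VL7, a VL of 30 maps to AARCH64_SV_MUL3 ((32 / 3) * 3 == 30),
   and a VL of 29 has no encoding, so AARCH64_NUM_SVPATTERNS is returned. */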
2710
2711 aarch64_svpattern
2712 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2713 {
2714 if (vl < 0)
2715 return AARCH64_SV_ALL;
2716
2717 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2718 return AARCH64_NUM_SVPATTERNS;
2719
2720 if (vl >= 1 && vl <= 8)
2721 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2722
2723 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2724 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2725
2726 int max_vl;
2727 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2728 {
2729 if (vl == (max_vl / 3) * 3)
2730 return AARCH64_SV_MUL3;
2731 /* These would only trigger for non-power-of-2 lengths. */
2732 if (vl == (max_vl & -4))
2733 return AARCH64_SV_MUL4;
2734 if (vl == (1 << floor_log2 (max_vl)))
2735 return AARCH64_SV_POW2;
2736 if (vl == max_vl)
2737 return AARCH64_SV_ALL;
2738 }
2739 return AARCH64_NUM_SVPATTERNS;
2740 }
2741
2742 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2743 bits has the lowest bit set and the upper bits clear. This is the
2744 VNx16BImode equivalent of a PTRUE for controlling elements of
2745 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2746 all bits are significant, even the upper zeros. */
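/* For example, aarch64_ptrue_all (2) builds the repeating VNx16BImode
   constant { 1, 0, 1, 0, ... }, which acts as a PTRUE for 2-byte (.H)
   elements. */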
2747
2748 rtx
2749 aarch64_ptrue_all (unsigned int elt_size)
2750 {
2751 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2752 builder.quick_push (const1_rtx);
2753 for (unsigned int i = 1; i < elt_size; ++i)
2754 builder.quick_push (const0_rtx);
2755 return builder.build ();
2756 }
2757
2758 /* Return an all-true predicate register of mode MODE. */
2759
2760 rtx
2761 aarch64_ptrue_reg (machine_mode mode)
2762 {
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2766 }
2767
2768 /* Return an all-false predicate register of mode MODE. */
2769
2770 rtx
2771 aarch64_pfalse_reg (machine_mode mode)
2772 {
2773 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2774 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2775 return gen_lowpart (mode, reg);
2776 }
2777
2778 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2779 true, or alternatively if we know that the operation predicated by
2780 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2781 aarch64_sve_gp_strictness operand that describes the operation
2782 predicated by PRED1[0]. */
2783
2784 bool
2785 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2786 {
2787 machine_mode mode = GET_MODE (pred2);
2788 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2789 && mode == GET_MODE (pred1[0])
2790 && aarch64_sve_gp_strictness (pred1[1], SImode));
2791 return (pred1[0] == CONSTM1_RTX (mode)
2792 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2793 || rtx_equal_p (pred1[0], pred2));
2794 }
2795
2796 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2797 for it. PRED2[0] is the predicate for the instruction whose result
2798 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2799 for it. Return true if we can prove that the two predicates are
2800 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2801 with PRED1[0] without changing behavior. */
2802
2803 bool
2804 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2805 {
2806 machine_mode mode = GET_MODE (pred1[0]);
2807 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2808 && mode == GET_MODE (pred2[0])
2809 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2810 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2811
2812 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2813 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2814 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2815 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2816 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2817 }
2818
2819 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
2820 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2821 Use TARGET as the target register if nonnull and convenient. */
2822
2823 static rtx
2824 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2825 machine_mode data_mode, rtx op1, rtx op2)
2826 {
2827 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2828 expand_operand ops[5];
2829 create_output_operand (&ops[0], target, pred_mode);
2830 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2831 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2832 create_input_operand (&ops[3], op1, data_mode);
2833 create_input_operand (&ops[4], op2, data_mode);
2834 expand_insn (icode, 5, ops);
2835 return ops[0].value;
2836 }
2837
2838 /* Use a comparison to convert integer vector SRC into MODE, which is
2839 the corresponding SVE predicate mode. Use TARGET for the result
2840 if it's nonnull and convenient. */
2841
2842 static rtx
2843 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2844 {
2845 machine_mode src_mode = GET_MODE (src);
2846 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2847 src, CONST0_RTX (src_mode));
2848 }
2849
2850 /* Return true if we can move VALUE into a register using a single
2851 CNT[BHWD] instruction. */
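/* For example, poly_int64 (6, 6) -- three doublewords per 128-bit
   quadword -- can be loaded with a single CNTD using multiplier 3,
   whereas poly_int64 (34, 34) would need a multiplier of 17 and is
   rejected. */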
2852
2853 static bool
2854 aarch64_sve_cnt_immediate_p (poly_int64 value)
2855 {
2856 HOST_WIDE_INT factor = value.coeffs[0];
2857 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2858 return (value.coeffs[1] == factor
2859 && IN_RANGE (factor, 2, 16 * 16)
2860 && (factor & 1) == 0
2861 && factor <= 16 * (factor & -factor));
2862 }
2863
2864 /* Likewise for rtx X. */
2865
2866 bool
2867 aarch64_sve_cnt_immediate_p (rtx x)
2868 {
2869 poly_int64 value;
2870 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2871 }
2872
2873 /* Return the asm string for an instruction with a CNT-like vector size
2874 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2875 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2876 first part of the operands template (the part that comes before the
2877 vector size itself). FACTOR is the number of quadwords.
2878 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2879 If it is zero, we can use any element size. */
2880
2881 static char *
2882 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2883 unsigned int factor,
2884 unsigned int nelts_per_vq)
2885 {
2886 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2887
2888 if (nelts_per_vq == 0)
2889 /* There is some overlap in the ranges of the four CNT instructions.
2890 Here we always use the smallest possible element size, so that the
2891 multiplier is 1 wherever possible. */
2892 nelts_per_vq = factor & -factor;
2893 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2894 gcc_assert (IN_RANGE (shift, 1, 4));
2895 char suffix = "dwhb"[shift - 1];
2896
2897 factor >>= shift;
2898 unsigned int written;
2899 if (factor == 1)
2900 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2901 prefix, suffix, operands);
2902 else
2903 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2904 prefix, suffix, operands, factor);
2905 gcc_assert (written < sizeof (buffer));
2906 return buffer;
2907 }
2908
2909 /* Return the asm string for an instruction with a CNT-like vector size
2910 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2911 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2912 first part of the operands template (the part that comes before the
2913 vector size itself). X is the value of the vector size operand,
2914 as a polynomial integer rtx. */
2915
2916 char *
2917 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2918 rtx x)
2919 {
2920 poly_int64 value = rtx_to_poly_int64 (x);
2921 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2922 return aarch64_output_sve_cnt_immediate (prefix, operands,
2923 value.coeffs[1], 0);
2924 }
2925
2926 /* Return true if we can add VALUE to a register using a single ADDVL
2927 or ADDPL instruction. */
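/* For example, poly_int64 (16, 16) -- one full vector of bytes -- can be
   added with ADDVL #1 and poly_int64 (6, 6) with ADDPL #3, whereas
   poly_int64 (3, 3) is rejected because ADDPL counts whole predicate
   widths (the factor must be even). */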
2928
2929 static bool
2930 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2931 {
2932 HOST_WIDE_INT factor = value.coeffs[0];
2933 if (factor == 0 || value.coeffs[1] != factor)
2934 return false;
2935 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2936 and a value of 16 is one vector width. */
2937 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2938 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2939 }
2940
2941 /* Likewise for rtx X. */
2942
2943 bool
2944 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2945 {
2946 poly_int64 value;
2947 return (poly_int_rtx_p (x, &value)
2948 && aarch64_sve_addvl_addpl_immediate_p (value));
2949 }
2950
2951 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2952 and storing the result in operand 0. */
2953
2954 char *
2955 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2956 {
2957 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2958 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2959 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2960
2961 /* Use INC or DEC if possible. */
2962 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2963 {
2964 if (aarch64_sve_cnt_immediate_p (offset_value))
2965 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2966 offset_value.coeffs[1], 0);
2967 if (aarch64_sve_cnt_immediate_p (-offset_value))
2968 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2969 -offset_value.coeffs[1], 0);
2970 }
2971
2972 int factor = offset_value.coeffs[1];
2973 if ((factor & 15) == 0)
2974 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2975 else
2976 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2977 return buffer;
2978 }
2979
2980 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2981 instruction. If it is, store the number of elements in each vector
2982 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2983 factor in *FACTOR_OUT (if nonnull). */
2984
2985 bool
2986 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2987 unsigned int *nelts_per_vq_out)
2988 {
2989 rtx elt;
2990 poly_int64 value;
2991
2992 if (!const_vec_duplicate_p (x, &elt)
2993 || !poly_int_rtx_p (elt, &value))
2994 return false;
2995
2996 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2997 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2998 /* There's no vector INCB. */
2999 return false;
3000
3001 HOST_WIDE_INT factor = value.coeffs[0];
3002 if (value.coeffs[1] != factor)
3003 return false;
3004
3005 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3006 if ((factor % nelts_per_vq) != 0
3007 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3008 return false;
3009
3010 if (factor_out)
3011 *factor_out = factor;
3012 if (nelts_per_vq_out)
3013 *nelts_per_vq_out = nelts_per_vq;
3014 return true;
3015 }
3016
3017 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3018 instruction. */
3019
3020 bool
3021 aarch64_sve_inc_dec_immediate_p (rtx x)
3022 {
3023 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3024 }
3025
3026 /* Return the asm template for an SVE vector INC or DEC instruction.
3027 OPERANDS gives the operands before the vector count and X is the
3028 value of the vector count operand itself. */
3029
3030 char *
3031 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3032 {
3033 int factor;
3034 unsigned int nelts_per_vq;
3035 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3036 gcc_unreachable ();
3037 if (factor < 0)
3038 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3039 nelts_per_vq);
3040 else
3041 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3042 nelts_per_vq);
3043 }
3044
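/* Set DEST to the integer constant IMM of mode MODE (SImode or DImode),
   using a sequence of MOV/MOVK/bitmask-immediate instructions. Only emit
   the instructions if GENERATE; in either case return the number of
   instructions required. */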
3045 static int
3046 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3047 scalar_int_mode mode)
3048 {
3049 int i;
3050 unsigned HOST_WIDE_INT val, val2, mask;
3051 int one_match, zero_match;
3052 int num_insns;
3053
3054 val = INTVAL (imm);
3055
3056 if (aarch64_move_imm (val, mode))
3057 {
3058 if (generate)
3059 emit_insn (gen_rtx_SET (dest, imm));
3060 return 1;
3061 }
3062
3063 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3064 (with XXXX non-zero). In that case check to see if the move can be done in
3065 a smaller mode. */
3066 val2 = val & 0xffffffff;
3067 if (mode == DImode
3068 && aarch64_move_imm (val2, SImode)
3069 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3070 {
3071 if (generate)
3072 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3073
3074 /* Check if we have to emit a second instruction by checking to see
3075 if any of the upper 32 bits of the original DI mode value is set. */
3076 if (val == val2)
3077 return 1;
3078
3079 i = (val >> 48) ? 48 : 32;
3080
3081 if (generate)
3082 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3083 GEN_INT ((val >> i) & 0xffff)));
3084
3085 return 2;
3086 }
3087
3088 if ((val >> 32) == 0 || mode == SImode)
3089 {
3090 if (generate)
3091 {
3092 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3093 if (mode == SImode)
3094 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3095 GEN_INT ((val >> 16) & 0xffff)));
3096 else
3097 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3098 GEN_INT ((val >> 16) & 0xffff)));
3099 }
3100 return 2;
3101 }
3102
3103 /* Remaining cases are all for DImode. */
3104
3105 mask = 0xffff;
3106 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3107 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3108 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3109 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3110
3111 if (zero_match != 2 && one_match != 2)
3112 {
3113 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3114 For a 64-bit bitmask try whether changing 16 bits to all ones or
3115 zeroes creates a valid bitmask. To check any repeated bitmask,
3116 try using 16 bits from the other 32-bit half of val. */
3117
3118 for (i = 0; i < 64; i += 16, mask <<= 16)
3119 {
3120 val2 = val & ~mask;
3121 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3122 break;
3123 val2 = val | mask;
3124 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3125 break;
3126 val2 = val2 & ~mask;
3127 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3128 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3129 break;
3130 }
3131 if (i != 64)
3132 {
3133 if (generate)
3134 {
3135 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3136 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3137 GEN_INT ((val >> i) & 0xffff)));
3138 }
3139 return 2;
3140 }
3141 }
3142
3143 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3144 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3145 otherwise skip zero bits. */
3146
3147 num_insns = 1;
3148 mask = 0xffff;
3149 val2 = one_match > zero_match ? ~val : val;
3150 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3151
3152 if (generate)
3153 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3154 ? (val | ~(mask << i))
3155 : (val & (mask << i)))));
3156 for (i += 16; i < 64; i += 16)
3157 {
3158 if ((val2 & (mask << i)) == 0)
3159 continue;
3160 if (generate)
3161 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3162 GEN_INT ((val >> i) & 0xffff)));
3163 num_insns ++;
3164 }
3165
3166 return num_insns;
3167 }
3168
3169 /* Return whether imm is a 128-bit immediate which is simple enough to
3170 expand inline. */
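/* For example, a 128-bit constant whose high and low 64-bit halves are 1
   and 2 needs one move for each half and is expanded inline; a constant
   whose two halves together would need more than four instructions is
   not. */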
3171 bool
3172 aarch64_mov128_immediate (rtx imm)
3173 {
3174 if (GET_CODE (imm) == CONST_INT)
3175 return true;
3176
3177 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3178
3179 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3180 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3181
3182 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3183 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3184 }
3185
3186
3187 /* Return the number of temporary registers that aarch64_add_offset_1
3188 would need to add OFFSET to a register. */
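/* Offsets smaller than 1 << 24 can always be added with at most two
   ADD/SUB immediates (12 bits each, the second shifted left by 12), so
   they need no temporary; anything larger needs one register to hold a
   move-immediate. */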
3189
3190 static unsigned int
3191 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3192 {
3193 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3194 }
3195
3196 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3197 a non-polynomial OFFSET. MODE is the mode of the addition.
3198 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3199 be set and CFA adjustments added to the generated instructions.
3200
3201 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3202 temporary if register allocation is already complete. This temporary
3203 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3204 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3205 the immediate again.
3206
3207 Since this function may be used to adjust the stack pointer, we must
3208 ensure that it cannot cause transient stack deallocation (for example
3209 by first incrementing SP and then decrementing when adjusting by a
3210 large immediate). */
3211
3212 static void
3213 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3214 rtx src, HOST_WIDE_INT offset, rtx temp1,
3215 bool frame_related_p, bool emit_move_imm)
3216 {
3217 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3218 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3219
3220 HOST_WIDE_INT moffset = abs_hwi (offset);
3221 rtx_insn *insn;
3222
3223 if (!moffset)
3224 {
3225 if (!rtx_equal_p (dest, src))
3226 {
3227 insn = emit_insn (gen_rtx_SET (dest, src));
3228 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3229 }
3230 return;
3231 }
3232
3233 /* Single instruction adjustment. */
3234 if (aarch64_uimm12_shift (moffset))
3235 {
3236 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3237 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3238 return;
3239 }
3240
3241 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3242 and either:
3243
3244 a) the offset cannot be loaded by a 16-bit move or
3245 b) there is no spare register into which we can move it. */
3246 if (moffset < 0x1000000
3247 && ((!temp1 && !can_create_pseudo_p ())
3248 || !aarch64_move_imm (moffset, mode)))
3249 {
3250 HOST_WIDE_INT low_off = moffset & 0xfff;
3251
3252 low_off = offset < 0 ? -low_off : low_off;
3253 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3254 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3255 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3256 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3257 return;
3258 }
3259
3260 /* Emit a move immediate if required and an addition/subtraction. */
3261 if (emit_move_imm)
3262 {
3263 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3264 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3265 }
3266 insn = emit_insn (offset < 0
3267 ? gen_sub3_insn (dest, src, temp1)
3268 : gen_add3_insn (dest, src, temp1));
3269 if (frame_related_p)
3270 {
3271 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3272 rtx adj = plus_constant (mode, src, offset);
3273 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3274 }
3275 }
3276
3277 /* Return the number of temporary registers that aarch64_add_offset
3278 would need to move OFFSET into a register or add OFFSET to a register;
3279 ADD_P is true if we want the latter rather than the former. */
3280
3281 static unsigned int
3282 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3283 {
3284 /* This follows the same structure as aarch64_add_offset. */
3285 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3286 return 0;
3287
3288 unsigned int count = 0;
3289 HOST_WIDE_INT factor = offset.coeffs[1];
3290 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3291 poly_int64 poly_offset (factor, factor);
3292 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3293 /* Need one register for the ADDVL/ADDPL result. */
3294 count += 1;
3295 else if (factor != 0)
3296 {
3297 factor = abs (factor);
3298 if (factor > 16 * (factor & -factor))
3299 /* Need one register for the CNT result and one for the multiplication
3300 factor. If necessary, the second temporary can be reused for the
3301 constant part of the offset. */
3302 return 2;
3303 /* Need one register for the CNT result (which might then
3304 be shifted). */
3305 count += 1;
3306 }
3307 return count + aarch64_add_offset_1_temporaries (constant);
3308 }
3309
3310 /* If X can be represented as a poly_int64, return the number
3311 of temporaries that are required to add it to a register.
3312 Return -1 otherwise. */
3313
3314 int
3315 aarch64_add_offset_temporaries (rtx x)
3316 {
3317 poly_int64 offset;
3318 if (!poly_int_rtx_p (x, &offset))
3319 return -1;
3320 return aarch64_offset_temporaries (true, offset);
3321 }
3322
3323 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3324 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3325 be set and CFA adjustments added to the generated instructions.
3326
3327 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3328 temporary if register allocation is already complete. This temporary
3329 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3330 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3331 false to avoid emitting the immediate again.
3332
3333 TEMP2, if nonnull, is a second temporary register that doesn't
3334 overlap either DEST or SRC.
3335
3336 Since this function may be used to adjust the stack pointer, we must
3337 ensure that it cannot cause transient stack deallocation (for example
3338 by first incrementing SP and then decrementing when adjusting by a
3339 large immediate). */
3340
3341 static void
3342 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3343 poly_int64 offset, rtx temp1, rtx temp2,
3344 bool frame_related_p, bool emit_move_imm = true)
3345 {
3346 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3347 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3348 gcc_assert (temp1 == NULL_RTX
3349 || !frame_related_p
3350 || !reg_overlap_mentioned_p (temp1, dest));
3351 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3352
3353 /* Try using ADDVL or ADDPL to add the whole value. */
3354 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3355 {
3356 rtx offset_rtx = gen_int_mode (offset, mode);
3357 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3358 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3359 return;
3360 }
3361
3362 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3363 SVE vector register, over and above the minimum size of 128 bits.
3364 This is equivalent to half the value returned by CNTD with a
3365 vector shape of ALL. */
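/* For example, an offset of two SVE vectors plus 8 bytes is the
   poly_int64 (40, 32): FACTOR is then 32 (two vector lengths, handled by
   ADDVL or a CNT-based sequence below) and CONSTANT is 8 (handled by
   aarch64_add_offset_1 at the end). */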
3366 HOST_WIDE_INT factor = offset.coeffs[1];
3367 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3368
3369 /* Try using ADDVL or ADDPL to add the VG-based part. */
3370 poly_int64 poly_offset (factor, factor);
3371 if (src != const0_rtx
3372 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3373 {
3374 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3375 if (frame_related_p)
3376 {
3377 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3378 RTX_FRAME_RELATED_P (insn) = true;
3379 src = dest;
3380 }
3381 else
3382 {
3383 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3384 src = aarch64_force_temporary (mode, temp1, addr);
3385 temp1 = temp2;
3386 temp2 = NULL_RTX;
3387 }
3388 }
3389 /* Otherwise use a CNT-based sequence. */
3390 else if (factor != 0)
3391 {
3392 /* Use a subtraction if we have a negative factor. */
3393 rtx_code code = PLUS;
3394 if (factor < 0)
3395 {
3396 factor = -factor;
3397 code = MINUS;
3398 }
3399
3400 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3401 into the multiplication. */
3402 rtx val;
3403 int shift = 0;
3404 if (factor & 1)
3405 /* Use a right shift by 1. */
3406 shift = -1;
3407 else
3408 factor /= 2;
3409 HOST_WIDE_INT low_bit = factor & -factor;
3410 if (factor <= 16 * low_bit)
3411 {
3412 if (factor > 16 * 8)
3413 {
3414 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3415 the value with the minimum multiplier and shift it into
3416 position. */
3417 int extra_shift = exact_log2 (low_bit);
3418 shift += extra_shift;
3419 factor >>= extra_shift;
3420 }
3421 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3422 }
3423 else
3424 {
3425 /* Use CNTD, then multiply it by FACTOR. */
3426 val = gen_int_mode (poly_int64 (2, 2), mode);
3427 val = aarch64_force_temporary (mode, temp1, val);
3428
3429 /* Go back to using a negative multiplication factor if we have
3430 no register from which to subtract. */
3431 if (code == MINUS && src == const0_rtx)
3432 {
3433 factor = -factor;
3434 code = PLUS;
3435 }
3436 rtx coeff1 = gen_int_mode (factor, mode);
3437 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3438 val = gen_rtx_MULT (mode, val, coeff1);
3439 }
3440
3441 if (shift > 0)
3442 {
3443 /* Multiply by 1 << SHIFT. */
3444 val = aarch64_force_temporary (mode, temp1, val);
3445 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3446 }
3447 else if (shift == -1)
3448 {
3449 /* Divide by 2. */
3450 val = aarch64_force_temporary (mode, temp1, val);
3451 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3452 }
3453
3454 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3455 if (src != const0_rtx)
3456 {
3457 val = aarch64_force_temporary (mode, temp1, val);
3458 val = gen_rtx_fmt_ee (code, mode, src, val);
3459 }
3460 else if (code == MINUS)
3461 {
3462 val = aarch64_force_temporary (mode, temp1, val);
3463 val = gen_rtx_NEG (mode, val);
3464 }
3465
3466 if (constant == 0 || frame_related_p)
3467 {
3468 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3469 if (frame_related_p)
3470 {
3471 RTX_FRAME_RELATED_P (insn) = true;
3472 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3473 gen_rtx_SET (dest, plus_constant (Pmode, src,
3474 poly_offset)));
3475 }
3476 src = dest;
3477 if (constant == 0)
3478 return;
3479 }
3480 else
3481 {
3482 src = aarch64_force_temporary (mode, temp1, val);
3483 temp1 = temp2;
3484 temp2 = NULL_RTX;
3485 }
3486
3487 emit_move_imm = true;
3488 }
3489
3490 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3491 frame_related_p, emit_move_imm);
3492 }
3493
3494 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3495 than a poly_int64. */
3496
3497 void
3498 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3499 rtx offset_rtx, rtx temp1, rtx temp2)
3500 {
3501 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3502 temp1, temp2, false);
3503 }
3504
3505 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3506 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3507 if TEMP1 already contains abs (DELTA). */
3508
3509 static inline void
3510 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3511 {
3512 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3513 temp1, temp2, true, emit_move_imm);
3514 }
3515
3516 /* Subtract DELTA from the stack pointer, marking the instructions
3517 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3518 if nonnull. */
3519
3520 static inline void
3521 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3522 bool emit_move_imm = true)
3523 {
3524 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3525 temp1, temp2, frame_related_p, emit_move_imm);
3526 }
3527
3528 /* Set DEST to (vec_series BASE STEP). */
3529
3530 static void
3531 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3532 {
3533 machine_mode mode = GET_MODE (dest);
3534 scalar_mode inner = GET_MODE_INNER (mode);
3535
3536 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3537 if (!aarch64_sve_index_immediate_p (base))
3538 base = force_reg (inner, base);
3539 if (!aarch64_sve_index_immediate_p (step))
3540 step = force_reg (inner, step);
3541
3542 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3543 }
3544
3545 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3546 register of mode MODE. Use TARGET for the result if it's nonnull
3547 and convenient.
3548
3549 The two vector modes must have the same element mode. The behavior
3550 is to duplicate architectural lane N of SRC into architectural lanes
3551 N + I * STEP of the result. On big-endian targets, architectural
3552 lane 0 of an Advanced SIMD vector is the last element of the vector
3553 in memory layout, so for big-endian targets this operation has the
3554 effect of reversing SRC before duplicating it. Callers need to
3555 account for this. */
3556
3557 rtx
3558 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3559 {
3560 machine_mode src_mode = GET_MODE (src);
3561 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3562 insn_code icode = (BYTES_BIG_ENDIAN
3563 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3564 : code_for_aarch64_vec_duplicate_vq_le (mode));
3565
3566 unsigned int i = 0;
3567 expand_operand ops[3];
3568 create_output_operand (&ops[i++], target, mode);
3569 create_output_operand (&ops[i++], src, src_mode);
3570 if (BYTES_BIG_ENDIAN)
3571 {
3572 /* Create a PARALLEL describing the reversal of SRC. */
3573 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3574 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3575 nelts_per_vq - 1, -1);
3576 create_fixed_operand (&ops[i++], sel);
3577 }
3578 expand_insn (icode, i, ops);
3579 return ops[0].value;
3580 }
3581
3582 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3583 the memory image into DEST. Return true on success. */
3584
3585 static bool
3586 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3587 {
3588 src = force_const_mem (GET_MODE (src), src);
3589 if (!src)
3590 return false;
3591
3592 /* Make sure that the address is legitimate. */
3593 if (!aarch64_sve_ld1rq_operand_p (src))
3594 {
3595 rtx addr = force_reg (Pmode, XEXP (src, 0));
3596 src = replace_equiv_address (src, addr);
3597 }
3598
3599 machine_mode mode = GET_MODE (dest);
3600 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3601 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3602 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3603 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3604 return true;
3605 }
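
/* For instance (an illustrative case): a duplicated-quadword constant
   such as { 1, 2, 3, 4, 1, 2, 3, 4, ... } reaches this function as its
   128-bit image { 1, 2, 3, 4 }, which is forced into the constant
   pool; LD1RQ then replicates that quadword across every 128-bit
   granule of the destination, whatever the runtime vector length.  */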
3606
3607 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3608 SVE data mode and isn't a legitimate constant. Use TARGET for the
3609 result if convenient.
3610
3611 The returned register can have whatever mode seems most natural
3612 given the contents of SRC. */
3613
3614 static rtx
3615 aarch64_expand_sve_const_vector (rtx target, rtx src)
3616 {
3617 machine_mode mode = GET_MODE (src);
3618 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3619 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3620 scalar_mode elt_mode = GET_MODE_INNER (mode);
3621 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3622 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3623
3624 if (nelts_per_pattern == 1 && encoded_bits == 128)
3625 {
3626 /* The constant is a duplicated quadword but can't be narrowed
3627 beyond a quadword. Get the memory image of the first quadword
3628 as a 128-bit vector and try using LD1RQ to load it from memory.
3629
3630 The effect for both endiannesses is to load memory lane N into
3631 architectural lanes N + I * STEP of the result. On big-endian
3632 targets, the layout of the 128-bit vector in an Advanced SIMD
3633 register would be different from its layout in an SVE register,
3634 but this 128-bit vector is a memory value only. */
3635 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3636 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3637 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3638 return target;
3639 }
3640
3641 if (nelts_per_pattern == 1 && encoded_bits < 128)
3642 {
3643 /* The vector is a repeating sequence of 64 bits or fewer.
3644 See if we can load them using an Advanced SIMD move and then
3645 duplicate it to fill a vector. This is better than using a GPR
3646 move because it keeps everything in the same register file. */
3647 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3648 rtx_vector_builder builder (vq_mode, npatterns, 1);
3649 for (unsigned int i = 0; i < npatterns; ++i)
3650 {
3651 /* We want memory lane N to go into architectural lane N,
3652 so reverse for big-endian targets. The DUP .Q pattern
3653 has a compensating reverse built-in. */
3654 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3655 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3656 }
3657 rtx vq_src = builder.build ();
3658 if (aarch64_simd_valid_immediate (vq_src, NULL))
3659 {
3660 vq_src = force_reg (vq_mode, vq_src);
3661 return aarch64_expand_sve_dupq (target, mode, vq_src);
3662 }
3663
3664 /* Get an integer representation of the repeating part of Advanced
3665 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3666 which for big-endian targets is lane-swapped wrt a normal
3667 Advanced SIMD vector. This means that for both endiannesses,
3668 memory lane N of SVE vector SRC corresponds to architectural
3669 lane N of a register holding VQ_SRC. This in turn means that
3670 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3671 as a single 128-bit value) and thus that memory lane 0 of SRC is
3672 in the lsb of the integer. Duplicating the integer therefore
3673 ensures that memory lane N of SRC goes into architectural lane
3674 N + I * STEP of the SVE register. */
3675 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3676 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3677 if (elt_value)
3678 {
3679 /* Pretend that we had a vector of INT_MODE to start with. */
3680 elt_mode = int_mode;
3681 mode = aarch64_full_sve_mode (int_mode).require ();
3682
3683 /* If the integer can be moved into a general register by a
3684 single instruction, do that and duplicate the result. */
3685 if (CONST_INT_P (elt_value)
3686 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3687 {
3688 elt_value = force_reg (elt_mode, elt_value);
3689 return expand_vector_broadcast (mode, elt_value);
3690 }
3691 }
3692 else if (npatterns == 1)
3693 /* We're duplicating a single value, but can't do better than
3694 force it to memory and load from there. This handles things
3695 like symbolic constants. */
3696 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3697
3698 if (elt_value)
3699 {
3700 /* Load the element from memory if we can, otherwise move it into
3701 a register and use a DUP. */
3702 rtx op = force_const_mem (elt_mode, elt_value);
3703 if (!op)
3704 op = force_reg (elt_mode, elt_value);
3705 return expand_vector_broadcast (mode, op);
3706 }
3707 }
3708
3709 /* Try using INDEX. */
3710 rtx base, step;
3711 if (const_vec_series_p (src, &base, &step))
3712 {
3713 aarch64_expand_vec_series (target, base, step);
3714 return target;
3715 }
3716
3717 /* From here on, it's better to force the whole constant to memory
3718 if we can. */
3719 if (GET_MODE_NUNITS (mode).is_constant ())
3720 return NULL_RTX;
3721
3722 /* Expand each pattern individually. */
3723 gcc_assert (npatterns > 1);
3724 rtx_vector_builder builder;
3725 auto_vec<rtx, 16> vectors (npatterns);
3726 for (unsigned int i = 0; i < npatterns; ++i)
3727 {
3728 builder.new_vector (mode, 1, nelts_per_pattern);
3729 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3730 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3731 vectors.quick_push (force_reg (mode, builder.build ()));
3732 }
3733
3734 /* Use permutes to interleave the separate vectors. */
3735 while (npatterns > 1)
3736 {
3737 npatterns /= 2;
3738 for (unsigned int i = 0; i < npatterns; ++i)
3739 {
3740 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3741 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3742 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3743 vectors[i] = tmp;
3744 }
3745 }
3746 gcc_assert (vectors[0] == target);
3747 return target;
3748 }
3749
3750 /* Use WHILE to set a predicate register of mode MODE in which the first
3751 VL bits are set and the rest are clear. Use TARGET for the register
3752 if it's nonnull and convenient. */
3753
3754 static rtx
3755 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3756 unsigned int vl)
3757 {
3758 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3759 target = aarch64_target_reg (target, mode);
3760 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3761 return target;
3762 }
3763
3764 static rtx
3765 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3766
3767 /* BUILDER is a constant predicate in which the index of every set bit
3768 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3769 by inverting every element at a multiple of ELT_SIZE and EORing the
3770 result with an ELT_SIZE PTRUE.
3771
3772 Return a register that contains the constant on success, otherwise
3773 return null. Use TARGET as the register if it is nonnull and
3774 convenient. */
3775
3776 static rtx
3777 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3778 unsigned int elt_size)
3779 {
3780 /* Invert every element at a multiple of ELT_SIZE, keeping the
3781 other bits zero. */
3782 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3783 builder.nelts_per_pattern ());
3784 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3785 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3786 inv_builder.quick_push (const1_rtx);
3787 else
3788 inv_builder.quick_push (const0_rtx);
3789 inv_builder.finalize ();
3790
3791 /* See if we can load the constant cheaply. */
3792 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3793 if (!inv)
3794 return NULL_RTX;
3795
3796 /* EOR the result with an ELT_SIZE PTRUE. */
3797 rtx mask = aarch64_ptrue_all (elt_size);
3798 mask = force_reg (VNx16BImode, mask);
3799 target = aarch64_target_reg (target, VNx16BImode);
3800 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3801 return target;
3802 }
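
/* For example (an illustrative case): the .H predicate
   { 0, 1, 1, 1, ... } has no direct encoding, but its inversion
   { 1, 0, 0, 0, ... } is a VL1 pattern that a single PTRUE can
   produce; EORing that with PTRUE .H, ALL recreates the original
   constant.  */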
3803
3804 /* BUILDER is a constant predicate in which the index of every set bit
3805 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3806 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3807 register on success, otherwise return null. Use TARGET as the register
3808 if nonnull and convenient. */
3809
3810 static rtx
3811 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3812 unsigned int elt_size,
3813 unsigned int permute_size)
3814 {
3815 /* We're going to split the constant into two new constants A and B,
3816 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3817 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3818
3819 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3820 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3821
3822 where _ indicates elements that will be discarded by the permute.
3823
3824 First calculate the ELT_SIZEs for A and B. */
3825 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3826 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3827 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3828 if (INTVAL (builder.elt (i)) != 0)
3829 {
3830 if (i & permute_size)
3831 b_elt_size |= i - permute_size;
3832 else
3833 a_elt_size |= i;
3834 }
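  /* The &= -X steps below keep only the lowest set bit of each
     accumulated value, giving the largest power of two that divides
     every recorded bit index (and the initial DImode size); this is
     the effective element size to aim for in A and B.  */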
3835 a_elt_size &= -a_elt_size;
3836 b_elt_size &= -b_elt_size;
3837
3838 /* Now construct the vectors themselves. */
3839 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3840 builder.nelts_per_pattern ());
3841 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3842 builder.nelts_per_pattern ());
3843 unsigned int nelts = builder.encoded_nelts ();
3844 for (unsigned int i = 0; i < nelts; ++i)
3845 if (i & (elt_size - 1))
3846 {
3847 a_builder.quick_push (const0_rtx);
3848 b_builder.quick_push (const0_rtx);
3849 }
3850 else if ((i & permute_size) == 0)
3851 {
3852 /* The A and B elements are significant. */
3853 a_builder.quick_push (builder.elt (i));
3854 b_builder.quick_push (builder.elt (i + permute_size));
3855 }
3856 else
3857 {
3858 /* The A and B elements are going to be discarded, so pick whatever
3859 is likely to give a nice constant. We are targeting element
3860 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3861 with the aim of each being a sequence of ones followed by
3862 a sequence of zeros. So:
3863
3864 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3865 duplicate the last X_ELT_SIZE element, to extend the
3866 current sequence of ones or zeros.
3867
3868 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3869 zero, so that the constant really does have X_ELT_SIZE and
3870 not a smaller size. */
3871 if (a_elt_size > permute_size)
3872 a_builder.quick_push (const0_rtx);
3873 else
3874 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3875 if (b_elt_size > permute_size)
3876 b_builder.quick_push (const0_rtx);
3877 else
3878 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3879 }
3880 a_builder.finalize ();
3881 b_builder.finalize ();
3882
3883 /* Try loading A into a register. */
3884 rtx_insn *last = get_last_insn ();
3885 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3886 if (!a)
3887 return NULL_RTX;
3888
3889 /* Try loading B into a register. */
3890 rtx b = a;
3891 if (a_builder != b_builder)
3892 {
3893 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3894 if (!b)
3895 {
3896 delete_insns_since (last);
3897 return NULL_RTX;
3898 }
3899 }
3900
3901 /* Emit the TRN1 itself. */
3902 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3903 target = aarch64_target_reg (target, mode);
3904 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3905 gen_lowpart (mode, a),
3906 gen_lowpart (mode, b)));
3907 return target;
3908 }
3909
3910 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3911 constant in BUILDER into an SVE predicate register. Return the register
3912 on success, otherwise return null. Use TARGET for the register if
3913 nonnull and convenient.
3914
3915 ALLOW_RECURSE_P is true if we can use methods that would call this
3916 function recursively. */
3917
3918 static rtx
3919 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3920 bool allow_recurse_p)
3921 {
3922 if (builder.encoded_nelts () == 1)
3923 /* A PFALSE or a PTRUE .B ALL. */
3924 return aarch64_emit_set_immediate (target, builder);
3925
3926 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3927 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3928 {
3929 /* If we can load the constant using PTRUE, use it as-is. */
3930 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3931 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3932 return aarch64_emit_set_immediate (target, builder);
3933
3934 /* Otherwise use WHILE to set the first VL bits. */
3935 return aarch64_sve_move_pred_via_while (target, mode, vl);
3936 }
3937
3938 if (!allow_recurse_p)
3939 return NULL_RTX;
3940
3941 /* Try inverting the vector in element size ELT_SIZE and then EORing
3942 the result with an ELT_SIZE PTRUE. */
3943 if (INTVAL (builder.elt (0)) == 0)
3944 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3945 elt_size))
3946 return res;
3947
3948 /* Try using TRN1 to permute two simpler constants. */
3949 for (unsigned int i = elt_size; i <= 8; i *= 2)
3950 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3951 elt_size, i))
3952 return res;
3953
3954 return NULL_RTX;
3955 }
3956
3957 /* Return an SVE predicate register that contains the VNx16BImode
3958 constant in BUILDER, without going through the move expanders.
3959
3960 The returned register can have whatever mode seems most natural
3961 given the contents of BUILDER. Use TARGET for the result if
3962 convenient. */
3963
3964 static rtx
3965 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3966 {
3967 /* Try loading the constant using pure predicate operations. */
3968 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3969 return res;
3970
3971 /* Try forcing the constant to memory. */
3972 if (builder.full_nelts ().is_constant ())
3973 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3974 {
3975 target = aarch64_target_reg (target, VNx16BImode);
3976 emit_move_insn (target, mem);
3977 return target;
3978 }
3979
3980 /* The last resort is to load the constant as an integer and then
3981 compare it against zero. Use -1 for set bits in order to increase
3982 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3983 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3984 builder.nelts_per_pattern ());
3985 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3986 int_builder.quick_push (INTVAL (builder.elt (i))
3987 ? constm1_rtx : const0_rtx);
3988 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3989 int_builder.build ());
3990 }
3991
3992 /* Set DEST to immediate IMM. */
3993
3994 void
3995 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3996 {
3997 machine_mode mode = GET_MODE (dest);
3998
3999 /* Check on what type of symbol it is. */
4000 scalar_int_mode int_mode;
4001 if ((GET_CODE (imm) == SYMBOL_REF
4002 || GET_CODE (imm) == LABEL_REF
4003 || GET_CODE (imm) == CONST
4004 || GET_CODE (imm) == CONST_POLY_INT)
4005 && is_a <scalar_int_mode> (mode, &int_mode))
4006 {
4007 rtx mem;
4008 poly_int64 offset;
4009 HOST_WIDE_INT const_offset;
4010 enum aarch64_symbol_type sty;
4011
4012 /* If we have (const (plus symbol offset)), separate out the offset
4013 before we start classifying the symbol. */
4014 rtx base = strip_offset (imm, &offset);
4015
4016 /* We must always add an offset involving VL separately, rather than
4017 folding it into the relocation. */
4018 if (!offset.is_constant (&const_offset))
4019 {
4020 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4021 emit_insn (gen_rtx_SET (dest, imm));
4022 else
4023 {
4024 /* Do arithmetic on 32-bit values if the result is smaller
4025 than that. */
4026 if (partial_subreg_p (int_mode, SImode))
4027 {
4028 /* It is invalid to do symbol calculations in modes
4029 narrower than SImode. */
4030 gcc_assert (base == const0_rtx);
4031 dest = gen_lowpart (SImode, dest);
4032 int_mode = SImode;
4033 }
4034 if (base != const0_rtx)
4035 {
4036 base = aarch64_force_temporary (int_mode, dest, base);
4037 aarch64_add_offset (int_mode, dest, base, offset,
4038 NULL_RTX, NULL_RTX, false);
4039 }
4040 else
4041 aarch64_add_offset (int_mode, dest, base, offset,
4042 dest, NULL_RTX, false);
4043 }
4044 return;
4045 }
4046
4047 sty = aarch64_classify_symbol (base, const_offset);
4048 switch (sty)
4049 {
4050 case SYMBOL_FORCE_TO_MEM:
4051 if (const_offset != 0
4052 && targetm.cannot_force_const_mem (int_mode, imm))
4053 {
4054 gcc_assert (can_create_pseudo_p ());
4055 base = aarch64_force_temporary (int_mode, dest, base);
4056 aarch64_add_offset (int_mode, dest, base, const_offset,
4057 NULL_RTX, NULL_RTX, false);
4058 return;
4059 }
4060
4061 mem = force_const_mem (ptr_mode, imm);
4062 gcc_assert (mem);
4063
4064 /* If we aren't generating PC relative literals, then
4065 we need to expand the literal pool access carefully.
4066 This is something that needs to be done in a number
4067 of places, so could well live as a separate function. */
4068 if (!aarch64_pcrelative_literal_loads)
4069 {
4070 gcc_assert (can_create_pseudo_p ());
4071 base = gen_reg_rtx (ptr_mode);
4072 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4073 if (ptr_mode != Pmode)
4074 base = convert_memory_address (Pmode, base);
4075 mem = gen_rtx_MEM (ptr_mode, base);
4076 }
4077
4078 if (int_mode != ptr_mode)
4079 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4080
4081 emit_insn (gen_rtx_SET (dest, mem));
4082
4083 return;
4084
4085 case SYMBOL_SMALL_TLSGD:
4086 case SYMBOL_SMALL_TLSDESC:
4087 case SYMBOL_SMALL_TLSIE:
4088 case SYMBOL_SMALL_GOT_28K:
4089 case SYMBOL_SMALL_GOT_4G:
4090 case SYMBOL_TINY_GOT:
4091 case SYMBOL_TINY_TLSIE:
4092 if (const_offset != 0)
4093 {
4094 gcc_assert (can_create_pseudo_p ());
4095 base = aarch64_force_temporary (int_mode, dest, base);
4096 aarch64_add_offset (int_mode, dest, base, const_offset,
4097 NULL_RTX, NULL_RTX, false);
4098 return;
4099 }
4100 /* FALLTHRU */
4101
4102 case SYMBOL_SMALL_ABSOLUTE:
4103 case SYMBOL_TINY_ABSOLUTE:
4104 case SYMBOL_TLSLE12:
4105 case SYMBOL_TLSLE24:
4106 case SYMBOL_TLSLE32:
4107 case SYMBOL_TLSLE48:
4108 aarch64_load_symref_appropriately (dest, imm, sty);
4109 return;
4110
4111 default:
4112 gcc_unreachable ();
4113 }
4114 }
4115
4116 if (!CONST_INT_P (imm))
4117 {
4118 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4119 {
4120 /* Only the low bit of each .H, .S and .D element is defined,
4121 so we can set the upper bits to whatever we like. If the
4122 predicate is all-true in MODE, prefer to set all the undefined
4123 bits as well, so that we can share a single .B predicate for
4124 all modes. */
4125 if (imm == CONSTM1_RTX (mode))
4126 imm = CONSTM1_RTX (VNx16BImode);
4127
4128 /* All methods for constructing predicate modes wider than VNx16BI
4129 will set the upper bits of each element to zero. Expose this
4130 by moving such constants as a VNx16BI, so that all bits are
4131 significant and so that constants for different modes can be
4132 shared. The wider constant will still be available as a
4133 REG_EQUAL note. */
4134 rtx_vector_builder builder;
4135 if (aarch64_get_sve_pred_bits (builder, imm))
4136 {
4137 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4138 if (dest != res)
4139 emit_move_insn (dest, gen_lowpart (mode, res));
4140 return;
4141 }
4142 }
4143
4144 if (GET_CODE (imm) == HIGH
4145 || aarch64_simd_valid_immediate (imm, NULL))
4146 {
4147 emit_insn (gen_rtx_SET (dest, imm));
4148 return;
4149 }
4150
4151 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4152 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4153 {
4154 if (dest != res)
4155 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4156 return;
4157 }
4158
4159 rtx mem = force_const_mem (mode, imm);
4160 gcc_assert (mem);
4161 emit_move_insn (dest, mem);
4162 return;
4163 }
4164
4165 aarch64_internal_mov_immediate (dest, imm, true,
4166 as_a <scalar_int_mode> (mode));
4167 }
4168
4169 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4170 that is known to contain PTRUE. */
4171
4172 void
4173 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4174 {
4175 expand_operand ops[3];
4176 machine_mode mode = GET_MODE (dest);
4177 create_output_operand (&ops[0], dest, mode);
4178 create_input_operand (&ops[1], pred, GET_MODE (pred));
4179 create_input_operand (&ops[2], src, mode);
4180 temporary_volatile_ok v (true);
4181 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4182 }
4183
4184 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4185 operand is in memory. In this case we need to use the predicated LD1
4186 and ST1 instead of LDR and STR, both for correctness on big-endian
4187 targets and because LD1 and ST1 support a wider range of addressing modes.
4188 PRED_MODE is the mode of the predicate.
4189
4190 See the comment at the head of aarch64-sve.md for details about the
4191 big-endian handling. */
4192
4193 void
4194 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4195 {
4196 machine_mode mode = GET_MODE (dest);
4197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4198 if (!register_operand (src, mode)
4199 && !register_operand (dest, mode))
4200 {
4201 rtx tmp = gen_reg_rtx (mode);
4202 if (MEM_P (src))
4203 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4204 else
4205 emit_move_insn (tmp, src);
4206 src = tmp;
4207 }
4208 aarch64_emit_sve_pred_move (dest, ptrue, src);
4209 }
4210
4211 /* Called only on big-endian targets. See whether an SVE vector move
4212 from SRC to DEST is effectively a REV[BHW] instruction, because at
4213 least one operand is a subreg of an SVE vector that has wider or
4214 narrower elements. Return true and emit the instruction if so.
4215
4216 For example:
4217
4218 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4219
4220 represents a VIEW_CONVERT between the following vectors, viewed
4221 in memory order:
4222
4223 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4224 R1: { [0], [1], [2], [3], ... }
4225
4226 The high part of lane X in R2 should therefore correspond to lane X*2
4227 of R1, but the register representations are:
4228
4229 msb lsb
4230 R2: ...... [1].high [1].low [0].high [0].low
4231 R1: ...... [3] [2] [1] [0]
4232
4233 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4234 We therefore need a reverse operation to swap the high and low values
4235 around.
4236
4237 This is purely an optimization. Without it we would spill the
4238 subreg operand to the stack in one mode and reload it in the
4239 other mode, which has the same effect as the REV. */
4240
4241 bool
4242 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4243 {
4244 gcc_assert (BYTES_BIG_ENDIAN);
4245 if (GET_CODE (dest) == SUBREG)
4246 dest = SUBREG_REG (dest);
4247 if (GET_CODE (src) == SUBREG)
4248 src = SUBREG_REG (src);
4249
4250 /* The optimization handles two single SVE REGs with different element
4251 sizes. */
4252 if (!REG_P (dest)
4253 || !REG_P (src)
4254 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4255 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4256 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4257 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4258 return false;
4259
4260 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4261 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4262 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4263 UNSPEC_REV_SUBREG);
4264 emit_insn (gen_rtx_SET (dest, unspec));
4265 return true;
4266 }
4267
4268 /* Return a copy of X with mode MODE, without changing its other
4269 attributes. Unlike gen_lowpart, this doesn't care whether the
4270 mode change is valid. */
4271
4272 static rtx
4273 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4274 {
4275 if (GET_MODE (x) == mode)
4276 return x;
4277
4278 x = shallow_copy_rtx (x);
4279 set_mode_and_regno (x, mode, REGNO (x));
4280 return x;
4281 }
4282
4283 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4284 operands. */
4285
4286 void
4287 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4288 {
4289 /* Decide which REV operation we need. The mode with narrower elements
4290 determines the mode of the operands and the mode with the wider
4291 elements determines the reverse width. */
4292 machine_mode mode_with_wider_elts = GET_MODE (dest);
4293 machine_mode mode_with_narrower_elts = GET_MODE (src);
4294 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4295 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4296 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4297
4298 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4299 unsigned int unspec;
4300 if (wider_bytes == 8)
4301 unspec = UNSPEC_REV64;
4302 else if (wider_bytes == 4)
4303 unspec = UNSPEC_REV32;
4304 else if (wider_bytes == 2)
4305 unspec = UNSPEC_REV16;
4306 else
4307 gcc_unreachable ();
4308 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4309
4310 /* Emit:
4311
4312 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] UNSPEC_PRED_X))
4313
4314 with the appropriate modes. */
4315 ptrue = gen_lowpart (pred_mode, ptrue);
4316 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4317 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4318 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4319 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4320 UNSPEC_PRED_X);
4321 emit_insn (gen_rtx_SET (dest, src));
4322 }
4323
4324 static bool
4325 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4326 tree exp ATTRIBUTE_UNUSED)
4327 {
4328 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4329 return false;
4330
4331 return true;
4332 }
4333
4334 /* Implement TARGET_PASS_BY_REFERENCE. */
4335
4336 static bool
4337 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4338 machine_mode mode,
4339 const_tree type,
4340 bool named ATTRIBUTE_UNUSED)
4341 {
4342 HOST_WIDE_INT size;
4343 machine_mode dummymode;
4344 int nregs;
4345
4346 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4347 if (mode == BLKmode && type)
4348 size = int_size_in_bytes (type);
4349 else
4350 /* No frontends can create types with variable-sized modes, so we
4351 shouldn't be asked to pass or return them. */
4352 size = GET_MODE_SIZE (mode).to_constant ();
4353
4354 /* Aggregates are passed by reference based on their size. */
4355 if (type && AGGREGATE_TYPE_P (type))
4356 {
4357 size = int_size_in_bytes (type);
4358 }
4359
4360 /* Variable sized arguments are always returned by reference. */
4361 if (size < 0)
4362 return true;
4363
4364 /* Can this be a candidate to be passed in fp/simd register(s)? */
4365 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4366 &dummymode, &nregs,
4367 NULL))
4368 return false;
4369
4370 /* Arguments which are variable sized or larger than 2 registers are
4371 passed by reference unless they are a homogeneous floating-point
4372 aggregate. */
4373 return size > 2 * UNITS_PER_WORD;
4374 }
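
/* Illustrative examples, assuming the usual LP64 layouts (these types
   are not taken from the sources):

     struct hfa { double d[4]; };   returns false above: an HFA,
                                    passed in SIMD/FP registers.
     struct big { long x[3]; };     24 bytes > 2 * UNITS_PER_WORD,
                                    so passed by reference.
     struct two { long x[2]; };     16 bytes, passed by value in a
                                    pair of general registers.  */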
4375
4376 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4377 static bool
4378 aarch64_return_in_msb (const_tree valtype)
4379 {
4380 machine_mode dummy_mode;
4381 int dummy_int;
4382
4383 /* Never happens in little-endian mode. */
4384 if (!BYTES_BIG_ENDIAN)
4385 return false;
4386
4387 /* Only composite types smaller than or equal to 16 bytes can
4388 be potentially returned in registers. */
4389 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4390 || int_size_in_bytes (valtype) <= 0
4391 || int_size_in_bytes (valtype) > 16)
4392 return false;
4393
4394 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4395 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4396 is always passed/returned in the least significant bits of fp/simd
4397 register(s). */
4398 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4399 &dummy_mode, &dummy_int, NULL))
4400 return false;
4401
4402 return true;
4403 }
4404
4405 /* Implement TARGET_FUNCTION_VALUE.
4406 Define how to find the value returned by a function. */
4407
4408 static rtx
4409 aarch64_function_value (const_tree type, const_tree func,
4410 bool outgoing ATTRIBUTE_UNUSED)
4411 {
4412 machine_mode mode;
4413 int unsignedp;
4414 int count;
4415 machine_mode ag_mode;
4416
4417 mode = TYPE_MODE (type);
4418 if (INTEGRAL_TYPE_P (type))
4419 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4420
4421 if (aarch64_return_in_msb (type))
4422 {
4423 HOST_WIDE_INT size = int_size_in_bytes (type);
4424
4425 if (size % UNITS_PER_WORD != 0)
4426 {
4427 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4428 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4429 }
4430 }
4431
4432 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4433 &ag_mode, &count, NULL))
4434 {
4435 if (!aarch64_composite_type_p (type, mode))
4436 {
4437 gcc_assert (count == 1 && mode == ag_mode);
4438 return gen_rtx_REG (mode, V0_REGNUM);
4439 }
4440 else
4441 {
4442 int i;
4443 rtx par;
4444
4445 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4446 for (i = 0; i < count; i++)
4447 {
4448 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4449 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4450 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4451 XVECEXP (par, 0, i) = tmp;
4452 }
4453 return par;
4454 }
4455 }
4456 else
4457 return gen_rtx_REG (mode, R0_REGNUM);
4458 }
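
/* For instance (an illustrative case): returning
   struct { float f[4]; } takes the PARALLEL path above, describing
   S0..S3, while a 16-byte non-HFA structure takes the final branch
   and comes back in X0/X1.  */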
4459
4460 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4461 Return true if REGNO is the number of a hard register in which the values
4462 of a called function may come back. */
4463
4464 static bool
4465 aarch64_function_value_regno_p (const unsigned int regno)
4466 {
4467 /* Maximum of 16 bytes can be returned in the general registers. Examples
4468 of 16-byte return values are: 128-bit integers and 16-byte small
4469 structures (excluding homogeneous floating-point aggregates). */
4470 if (regno == R0_REGNUM || regno == R1_REGNUM)
4471 return true;
4472
4473 /* Up to four fp/simd registers can return a function value, e.g. a
4474 homogeneous floating-point aggregate having four members. */
4475 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4476 return TARGET_FLOAT;
4477
4478 return false;
4479 }
4480
4481 /* Implement TARGET_RETURN_IN_MEMORY.
4482
4483 If the type T of the result of a function is such that
4484 void func (T arg)
4485 would require that arg be passed as a value in a register (or set of
4486 registers) according to the parameter passing rules, then the result
4487 is returned in the same registers as would be used for such an
4488 argument. */
4489
4490 static bool
4491 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4492 {
4493 HOST_WIDE_INT size;
4494 machine_mode ag_mode;
4495 int count;
4496
4497 if (!AGGREGATE_TYPE_P (type)
4498 && TREE_CODE (type) != COMPLEX_TYPE
4499 && TREE_CODE (type) != VECTOR_TYPE)
4500 /* Simple scalar types are always returned in registers. */
4501 return false;
4502
4503 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4504 type,
4505 &ag_mode,
4506 &count,
4507 NULL))
4508 return false;
4509
4510 /* Types larger than 2 registers are returned in memory. */
4511 size = int_size_in_bytes (type);
4512 return (size < 0 || size > 2 * UNITS_PER_WORD);
4513 }
4514
4515 static bool
4516 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4517 const_tree type, int *nregs)
4518 {
4519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4520 return aarch64_vfp_is_call_or_return_candidate (mode,
4521 type,
4522 &pcum->aapcs_vfp_rmode,
4523 nregs,
4524 NULL);
4525 }
4526
4527 /* Given MODE and TYPE of a function argument, return the alignment in
4528 bits. The idea is to suppress any stronger alignment requested by
4529 the user and opt for the natural alignment (specified in AAPCS64 \S
4530 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4531 calculated in versions of GCC prior to GCC-9. This is a helper
4532 function for local use only. */
4533
4534 static unsigned int
4535 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4536 bool *abi_break)
4537 {
4538 *abi_break = false;
4539 if (!type)
4540 return GET_MODE_ALIGNMENT (mode);
4541
4542 if (integer_zerop (TYPE_SIZE (type)))
4543 return 0;
4544
4545 gcc_assert (TYPE_MODE (type) == mode);
4546
4547 if (!AGGREGATE_TYPE_P (type))
4548 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4549
4550 if (TREE_CODE (type) == ARRAY_TYPE)
4551 return TYPE_ALIGN (TREE_TYPE (type));
4552
4553 unsigned int alignment = 0;
4554 unsigned int bitfield_alignment = 0;
4555 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4556 if (TREE_CODE (field) == FIELD_DECL)
4557 {
4558 alignment = std::max (alignment, DECL_ALIGN (field));
4559 if (DECL_BIT_FIELD_TYPE (field))
4560 bitfield_alignment
4561 = std::max (bitfield_alignment,
4562 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4563 }
4564
4565 if (bitfield_alignment > alignment)
4566 {
4567 *abi_break = true;
4568 return bitfield_alignment;
4569 }
4570
4571 return alignment;
4572 }
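
/* A sketch of the effect (an illustration, not from the sources): a
   user-over-aligned wrapper such as

     struct s { int x; } __attribute__ ((aligned (16)));

   is treated with the 32-bit natural alignment of its field rather
   than the requested 128 bits, whereas a structure whose dominant
   alignment comes from a bit-field's container type takes that
   container alignment and sets *ABI_BREAK, since releases before
   GCC 9.1 did not consult DECL_BIT_FIELD_TYPE here.  */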
4573
4574 /* Layout a function argument according to the AAPCS64 rules. The rule
4575 numbers refer to the rule numbers in the AAPCS64. */
4576
4577 static void
4578 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4579 const_tree type,
4580 bool named ATTRIBUTE_UNUSED)
4581 {
4582 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4583 int ncrn, nvrn, nregs;
4584 bool allocate_ncrn, allocate_nvrn;
4585 HOST_WIDE_INT size;
4586 bool abi_break;
4587
4588 /* We need to do this once per argument. */
4589 if (pcum->aapcs_arg_processed)
4590 return;
4591
4592 pcum->aapcs_arg_processed = true;
4593
4594 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4595 if (type)
4596 size = int_size_in_bytes (type);
4597 else
4598 /* No frontends can create types with variable-sized modes, so we
4599 shouldn't be asked to pass or return them. */
4600 size = GET_MODE_SIZE (mode).to_constant ();
4601 size = ROUND_UP (size, UNITS_PER_WORD);
4602
4603 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4604 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4605 mode,
4606 type,
4607 &nregs);
4608
4609 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4610 The following code thus handles passing by SIMD/FP registers first. */
4611
4612 nvrn = pcum->aapcs_nvrn;
4613
4614 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4615 and homogeneous short-vector aggregates (HVA). */
4616 if (allocate_nvrn)
4617 {
4618 if (!TARGET_FLOAT)
4619 aarch64_err_no_fpadvsimd (mode);
4620
4621 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4622 {
4623 pcum->aapcs_nextnvrn = nvrn + nregs;
4624 if (!aarch64_composite_type_p (type, mode))
4625 {
4626 gcc_assert (nregs == 1);
4627 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4628 }
4629 else
4630 {
4631 rtx par;
4632 int i;
4633 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4634 for (i = 0; i < nregs; i++)
4635 {
4636 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4637 V0_REGNUM + nvrn + i);
4638 rtx offset = gen_int_mode
4639 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4640 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4641 XVECEXP (par, 0, i) = tmp;
4642 }
4643 pcum->aapcs_reg = par;
4644 }
4645 return;
4646 }
4647 else
4648 {
4649 /* C.3 NSRN is set to 8. */
4650 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4651 goto on_stack;
4652 }
4653 }
4654
4655 ncrn = pcum->aapcs_ncrn;
4656 nregs = size / UNITS_PER_WORD;
4657
4658 /* C6 - C9, though the sign and zero extension semantics are
4659 handled elsewhere. This is the case where the argument fits
4660 entirely in general registers. */
4661 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4662 {
4663 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4664
4665 /* C.8 if the argument has an alignment of 16 then the NGRN is
4666 rounded up to the next even number. */
4667 if (nregs == 2
4668 && ncrn % 2
4669 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4670 comparison is there because for > 16 * BITS_PER_UNIT
4671 alignment nregs should be > 2 and therefore it should be
4672 passed by reference rather than value. */
4673 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4674 == 16 * BITS_PER_UNIT))
4675 {
4676 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4677 inform (input_location, "parameter passing for argument of type "
4678 "%qT changed in GCC 9.1", type);
4679 ++ncrn;
4680 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4681 }
4682
4683 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4684 A reg is still generated for it, but the caller should be smart
4685 enough not to use it. */
4686 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4687 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4688 else
4689 {
4690 rtx par;
4691 int i;
4692
4693 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4694 for (i = 0; i < nregs; i++)
4695 {
4696 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4697 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4698 GEN_INT (i * UNITS_PER_WORD));
4699 XVECEXP (par, 0, i) = tmp;
4700 }
4701 pcum->aapcs_reg = par;
4702 }
4703
4704 pcum->aapcs_nextncrn = ncrn + nregs;
4705 return;
4706 }
4707
4708 /* C.11 */
4709 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4710
4711 /* The argument is passed on stack; record the needed number of words for
4712 this argument and align the total size if necessary. */
4713 on_stack:
4714 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4715
4716 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4717 == 16 * BITS_PER_UNIT)
4718 {
4719 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4720 if (pcum->aapcs_stack_size != new_size)
4721 {
4722 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4723 inform (input_location, "parameter passing for argument of type "
4724 "%qT changed in GCC 9.1", type);
4725 pcum->aapcs_stack_size = new_size;
4726 }
4727 }
4728 return;
4729 }
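
/* Worked example of rule C.8 above (an assumed call, not from the
   sources): for

     void f (int a, __int128 b);

   A occupies W0, leaving NCRN == 1; B needs two registers and has
   16-byte alignment, so NCRN is rounded up to 2 and B is passed in
   the even-numbered pair X2/X3.  */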
4730
4731 /* Implement TARGET_FUNCTION_ARG. */
4732
4733 static rtx
4734 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4735 const_tree type, bool named)
4736 {
4737 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4738 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4739
4740 if (mode == VOIDmode)
4741 return NULL_RTX;
4742
4743 aarch64_layout_arg (pcum_v, mode, type, named);
4744 return pcum->aapcs_reg;
4745 }
4746
4747 void
4748 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4749 const_tree fntype ATTRIBUTE_UNUSED,
4750 rtx libname ATTRIBUTE_UNUSED,
4751 const_tree fndecl ATTRIBUTE_UNUSED,
4752 unsigned n_named ATTRIBUTE_UNUSED)
4753 {
4754 pcum->aapcs_ncrn = 0;
4755 pcum->aapcs_nvrn = 0;
4756 pcum->aapcs_nextncrn = 0;
4757 pcum->aapcs_nextnvrn = 0;
4758 pcum->pcs_variant = ARM_PCS_AAPCS64;
4759 pcum->aapcs_reg = NULL_RTX;
4760 pcum->aapcs_arg_processed = false;
4761 pcum->aapcs_stack_words = 0;
4762 pcum->aapcs_stack_size = 0;
4763
4764 if (!TARGET_FLOAT
4765 && fndecl && TREE_PUBLIC (fndecl)
4766 && fntype && fntype != error_mark_node)
4767 {
4768 const_tree type = TREE_TYPE (fntype);
4769 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4770 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4771 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4772 &mode, &nregs, NULL))
4773 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4774 }
4775 return;
4776 }
4777
4778 static void
4779 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4780 machine_mode mode,
4781 const_tree type,
4782 bool named)
4783 {
4784 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4785 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4786 {
4787 aarch64_layout_arg (pcum_v, mode, type, named);
4788 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4789 != (pcum->aapcs_stack_words != 0));
4790 pcum->aapcs_arg_processed = false;
4791 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4792 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4793 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4794 pcum->aapcs_stack_words = 0;
4795 pcum->aapcs_reg = NULL_RTX;
4796 }
4797 }
4798
4799 bool
4800 aarch64_function_arg_regno_p (unsigned regno)
4801 {
4802 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4803 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4804 }
4805
4806 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4807 PARM_BOUNDARY bits of alignment, but will be given anything up
4808 to STACK_BOUNDARY bits if the type requires it. This makes sure
4809 that both before and after the layout of each argument, the Next
4810 Stacked Argument Address (NSAA) will have a minimum alignment of
4811 8 bytes. */
4812
4813 static unsigned int
4814 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4815 {
4816 bool abi_break;
4817 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4818 &abi_break);
4819 if (abi_break && warn_psabi)
4820 inform (input_location, "parameter passing for argument of type "
4821 "%qT changed in GCC 9.1", type);
4822
4823 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4824 }
4825
4826 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4827
4828 static fixed_size_mode
4829 aarch64_get_reg_raw_mode (int regno)
4830 {
4831 if (TARGET_SVE && FP_REGNUM_P (regno))
4832 /* Don't use the SVE part of the register for __builtin_apply and
4833 __builtin_return. The SVE registers aren't used by the normal PCS,
4834 so using them there would be a waste of time. The PCS extensions
4835 for SVE types are fundamentally incompatible with the
4836 __builtin_return/__builtin_apply interface. */
4837 return as_a <fixed_size_mode> (V16QImode);
4838 return default_get_reg_raw_mode (regno);
4839 }
4840
4841 /* Implement TARGET_FUNCTION_ARG_PADDING.
4842
4843 Small aggregate types are placed at the lowest memory address.
4844
4845 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4846
4847 static pad_direction
4848 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4849 {
4850 /* On little-endian targets, the least significant byte of every stack
4851 argument is passed at the lowest byte address of the stack slot. */
4852 if (!BYTES_BIG_ENDIAN)
4853 return PAD_UPWARD;
4854
4855 /* Otherwise, integral, floating-point and pointer types are padded downward:
4856 the least significant byte of a stack argument is passed at the highest
4857 byte address of the stack slot. */
4858 if (type
4859 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4860 || POINTER_TYPE_P (type))
4861 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4862 return PAD_DOWNWARD;
4863
4864 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4865 return PAD_UPWARD;
4866 }
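
/* For example (an assumed layout): a 3-byte struct { char c[3]; }
   passed on the stack keeps PAD_UPWARD on both endiannesses, so its
   bytes sit at the lowest addresses of the 8-byte slot, whereas a
   plain int argument is padded downward on big-endian and therefore
   lands at the highest addresses of its slot.  */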
4867
4868 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4869
4870 It specifies padding for the last (possibly the only)
4871 element of a block move between registers and memory.
4872 Assuming the block is in memory, padding upward means that
4873 the last element is padded after its most significant byte,
4874 while with downward padding the last element is padded at
4875 its least significant byte side.
4876
4877 Small aggregates and small complex types are always padded
4878 upwards.
4879
4880 We don't need to worry about homogeneous floating-point or
4881 short-vector aggregates; their move is not affected by the
4882 padding direction determined here. Regardless of endianness,
4883 each element of such an aggregate is put in the least
4884 significant bits of a fp/simd register.
4885
4886 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4887 register has useful data, and return the opposite if the most
4888 significant byte does. */
4889
4890 bool
4891 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4892 bool first ATTRIBUTE_UNUSED)
4893 {
4894
4895 /* Small composite types are always padded upward. */
4896 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4897 {
4898 HOST_WIDE_INT size;
4899 if (type)
4900 size = int_size_in_bytes (type);
4901 else
4902 /* No frontends can create types with variable-sized modes, so we
4903 shouldn't be asked to pass or return them. */
4904 size = GET_MODE_SIZE (mode).to_constant ();
4905 if (size < 2 * UNITS_PER_WORD)
4906 return true;
4907 }
4908
4909 /* Otherwise, use the default padding. */
4910 return !BYTES_BIG_ENDIAN;
4911 }
4912
4913 static scalar_int_mode
4914 aarch64_libgcc_cmp_return_mode (void)
4915 {
4916 return SImode;
4917 }
4918
4919 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4920
4921 /* We use the 12-bit shifted immediate arithmetic instructions so values
4922 must be multiple of (1 << 12), i.e. 4096. */
4923 #define ARITH_FACTOR 4096
4924
4925 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4926 #error Cannot use simple address calculation for stack probing
4927 #endif
4928
4929 /* The pair of scratch registers used for stack probing. */
4930 #define PROBE_STACK_FIRST_REG R9_REGNUM
4931 #define PROBE_STACK_SECOND_REG R10_REGNUM
4932
4933 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4934 inclusive. These are offsets from the current stack pointer. */
4935
4936 static void
4937 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4938 {
4939 HOST_WIDE_INT size;
4940 if (!poly_size.is_constant (&size))
4941 {
4942 sorry ("stack probes for SVE frames");
4943 return;
4944 }
4945
4946 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4947
4948 /* See the same assertion on PROBE_INTERVAL above. */
4949 gcc_assert ((first % ARITH_FACTOR) == 0);
4950
4951 /* See if we have a constant small number of probes to generate. If so,
4952 that's the easy case. */
4953 if (size <= PROBE_INTERVAL)
4954 {
4955 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4956
4957 emit_set_insn (reg1,
4958 plus_constant (Pmode,
4959 stack_pointer_rtx, -(first + base)));
4960 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4961 }
4962
4963 /* The run-time loop is made up of 8 insns in the generic case while the
4964 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4965 else if (size <= 4 * PROBE_INTERVAL)
4966 {
4967 HOST_WIDE_INT i, rem;
4968
4969 emit_set_insn (reg1,
4970 plus_constant (Pmode,
4971 stack_pointer_rtx,
4972 -(first + PROBE_INTERVAL)));
4973 emit_stack_probe (reg1);
4974
4975 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4976 it exceeds SIZE. If only two probes are needed, this will not
4977 generate any code. Then probe at FIRST + SIZE. */
4978 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4979 {
4980 emit_set_insn (reg1,
4981 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4982 emit_stack_probe (reg1);
4983 }
4984
4985 rem = size - (i - PROBE_INTERVAL);
4986 if (rem > 256)
4987 {
4988 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4989
4990 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4991 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4992 }
4993 else
4994 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4995 }
4996
4997 /* Otherwise, do the same as above, but in a loop. Note that we must be
4998 extra careful with variables wrapping around because we might be at
4999 the very top (or the very bottom) of the address space and we have
5000 to be able to handle this case properly; in particular, we use an
5001 equality test for the loop condition. */
5002 else
5003 {
5004 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5005
5006 /* Step 1: round SIZE to the previous multiple of the interval. */
5007
5008 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5009
5010
5011 /* Step 2: compute initial and final value of the loop counter. */
5012
5013 /* TEST_ADDR = SP + FIRST. */
5014 emit_set_insn (reg1,
5015 plus_constant (Pmode, stack_pointer_rtx, -first));
5016
5017 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5018 HOST_WIDE_INT adjustment = - (first + rounded_size);
5019 if (! aarch64_uimm12_shift (adjustment))
5020 {
5021 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5022 true, Pmode);
5023 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5024 }
5025 else
5026 emit_set_insn (reg2,
5027 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5028
5029 /* Step 3: the loop
5030
5031 do
5032 {
5033 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5034 probe at TEST_ADDR
5035 }
5036 while (TEST_ADDR != LAST_ADDR)
5037
5038 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5039 until it is equal to ROUNDED_SIZE. */
5040
5041 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5042
5043
5044 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5045 that SIZE is equal to ROUNDED_SIZE. */
5046
5047 if (size != rounded_size)
5048 {
5049 HOST_WIDE_INT rem = size - rounded_size;
5050
5051 if (rem > 256)
5052 {
5053 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5054
5055 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5056 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5057 }
5058 else
5059 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5060 }
5061 }
5062
5063 /* Make sure nothing is scheduled before we are done. */
5064 emit_insn (gen_blockage ());
5065 }
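
/* Worked example (numbers chosen for illustration only): with
   PROBE_INTERVAL == 4096, FIRST == 0 and SIZE == 10000 the second
   branch above applies.  Probes are emitted at SP - 4096 and
   SP - 8192, leaving REM == 1808 > 256; REG1 then drops by
   ROUND_UP (1808, 4096) == 4096 to SP - 12288 and the final probe
   lands at SP - 12288 + 2288 == SP - 10000.  */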
5066
5067 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5068 absolute addresses. */
5069
5070 const char *
5071 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5072 {
5073 static int labelno = 0;
5074 char loop_lab[32];
5075 rtx xops[2];
5076
5077 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5078
5079 /* Loop. */
5080 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5081
5082 HOST_WIDE_INT stack_clash_probe_interval
5083 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5084
5085 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5086 xops[0] = reg1;
5087 HOST_WIDE_INT interval;
5088 if (flag_stack_clash_protection)
5089 interval = stack_clash_probe_interval;
5090 else
5091 interval = PROBE_INTERVAL;
5092
5093 gcc_assert (aarch64_uimm12_shift (interval));
5094 xops[1] = GEN_INT (interval);
5095
5096 output_asm_insn ("sub\t%0, %0, %1", xops);
5097
5098 /* If doing stack clash protection then we probe up by the ABI specified
5099 amount. We do this because we're dropping full pages at a time in the
5100 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5101 if (flag_stack_clash_protection)
5102 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5103 else
5104 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5105
5106 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5107 by this amount for each iteration. */
5108 output_asm_insn ("str\txzr, [%0, %1]", xops);
5109
5110 /* Test if TEST_ADDR == LAST_ADDR. */
5111 xops[1] = reg2;
5112 output_asm_insn ("cmp\t%0, %1", xops);
5113
5114 /* Branch. */
5115 fputs ("\tb.ne\t", asm_out_file);
5116 assemble_name_raw (asm_out_file, loop_lab);
5117 fputc ('\n', asm_out_file);
5118
5119 return "";
5120 }
5121
5122 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5123 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5124 of GUARD_SIZE. When a probe is emitted it is done at most
5125 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5126 at most MIN_PROBE_THRESHOLD. By the end of this function
5127 BASE = BASE - ADJUSTMENT. */
5128
5129 const char *
5130 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5131 rtx min_probe_threshold, rtx guard_size)
5132 {
5133 /* This function is not allowed to use any instruction generation function
5134 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5135 so instead emit the code you want using output_asm_insn. */
5136 gcc_assert (flag_stack_clash_protection);
5137 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5138 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5139
5140 /* The minimum required allocation before the residual requires probing. */
5141 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5142
5143 /* Clamp the value down to the nearest value that can be used with a cmp. */
5144 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5145 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5146
5147 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5148 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5149
5150 static int labelno = 0;
5151 char loop_start_lab[32];
5152 char loop_end_lab[32];
5153 rtx xops[2];
5154
5155 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5156 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5157
5158 /* Emit loop start label. */
5159 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5160
5161 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5162 xops[0] = adjustment;
5163 xops[1] = probe_offset_value_rtx;
5164 output_asm_insn ("cmp\t%0, %1", xops);
5165
5166 /* Branch to end if not enough adjustment to probe. */
5167 fputs ("\tb.lt\t", asm_out_file);
5168 assemble_name_raw (asm_out_file, loop_end_lab);
5169 fputc ('\n', asm_out_file);
5170
5171 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5172 xops[0] = base;
5173 xops[1] = probe_offset_value_rtx;
5174 output_asm_insn ("sub\t%0, %0, %1", xops);
5175
5176 /* Probe at BASE. */
5177 xops[1] = const0_rtx;
5178 output_asm_insn ("str\txzr, [%0, %1]", xops);
5179
5180 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5181 xops[0] = adjustment;
5182 xops[1] = probe_offset_value_rtx;
5183 output_asm_insn ("sub\t%0, %0, %1", xops);
5184
5185 /* Branch to start if still more bytes to allocate. */
5186 fputs ("\tb\t", asm_out_file);
5187 assemble_name_raw (asm_out_file, loop_start_lab);
5188 fputc ('\n', asm_out_file);
5189
5190 /* The remaining adjustment does not need a probe; leave the loop. */
5191 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5192
5193 /* BASE = BASE - ADJUSTMENT. */
5194 xops[0] = base;
5195 xops[1] = adjustment;
5196 output_asm_insn ("sub\t%0, %0, %1", xops);
5197 return "";
5198 }
5199
5200 /* Determine whether a frame chain needs to be generated. */
5201 static bool
5202 aarch64_needs_frame_chain (void)
5203 {
5204 /* Force a frame chain for EH returns so the return address is at FP+8. */
5205 if (frame_pointer_needed || crtl->calls_eh_return)
5206 return true;
5207
5208 /* A leaf function cannot have calls or write LR. */
5209 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5210
5211 /* Don't use a frame chain in leaf functions if leaf frame pointers
5212 are disabled. */
5213 if (flag_omit_leaf_frame_pointer && is_leaf)
5214 return false;
5215
5216 return aarch64_use_frame_pointer;
5217 }
5218
5219 /* Mark the registers that need to be saved by the callee and calculate
5220 the size of the callee-saved registers area and frame record (both FP
5221 and LR may be omitted). */
5222 static void
5223 aarch64_layout_frame (void)
5224 {
5225 HOST_WIDE_INT offset = 0;
5226 int regno, last_fp_reg = INVALID_REGNUM;
5227 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5228
5229 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5230
5231 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5232 the mid-end is doing. */
5233 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5234
5235 #define SLOT_NOT_REQUIRED (-2)
5236 #define SLOT_REQUIRED (-1)
5237
5238 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5239 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5240
5241 /* If this is a non-leaf simd function with calls we assume that
5242 at least one of those calls is to a non-simd function and thus
5243 we must save V8 to V23 in the prologue. */
5244
5245 if (simd_function && !crtl->is_leaf)
5246 {
5247 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5248 if (FP_SIMD_SAVED_REGNUM_P (regno))
5249 df_set_regs_ever_live (regno, true);
5250 }
5251
5252 /* First mark all the registers that really need to be saved... */
5253 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5254 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5255
5256 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5257 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5258
5259 /* ... that includes the eh data registers (if needed)... */
5260 if (crtl->calls_eh_return)
5261 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5262 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5263 = SLOT_REQUIRED;
5264
5265 /* ... and any callee saved register that dataflow says is live. */
5266 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5267 if (df_regs_ever_live_p (regno)
5268 && (regno == R30_REGNUM
5269 || !call_used_regs[regno]))
5270 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5271
5272 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5273 if (df_regs_ever_live_p (regno)
5274 && (!call_used_regs[regno]
5275 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5276 {
5277 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5278 last_fp_reg = regno;
5279 }
5280
5281 if (cfun->machine->frame.emit_frame_chain)
5282 {
5283 /* FP and LR are placed in the linkage record. */
5284 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5285 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5286 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5287 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5288 offset = 2 * UNITS_PER_WORD;
5289 }
5290
5291 /* With stack-clash, LR must be saved in non-leaf functions. */
5292 gcc_assert (crtl->is_leaf
5293 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5294 != SLOT_NOT_REQUIRED));
5295
5296 /* Now assign stack slots for them. */
5297 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5298 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5299 {
5300 cfun->machine->frame.reg_offset[regno] = offset;
5301 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5302 cfun->machine->frame.wb_candidate1 = regno;
5303 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5304 cfun->machine->frame.wb_candidate2 = regno;
5305 offset += UNITS_PER_WORD;
5306 }
5307
5308 HOST_WIDE_INT max_int_offset = offset;
5309 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5310 bool has_align_gap = offset != max_int_offset;
5311
5312 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5313 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5314 {
5315 /* If there is an alignment gap between integer and fp callee-saves,
5316 allocate the last fp register to it if possible. */
5317 if (regno == last_fp_reg
5318 && has_align_gap
5319 && !simd_function
5320 && (offset & 8) == 0)
5321 {
5322 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5323 break;
5324 }
5325
5326 cfun->machine->frame.reg_offset[regno] = offset;
5327 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5328 cfun->machine->frame.wb_candidate1 = regno;
5329 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5330 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5331 cfun->machine->frame.wb_candidate2 = regno;
5332 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5333 }
5334
5335 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5336
5337 cfun->machine->frame.saved_regs_size = offset;
5338
5339 HOST_WIDE_INT varargs_and_saved_regs_size
5340 = offset + cfun->machine->frame.saved_varargs_size;
5341
5342 cfun->machine->frame.hard_fp_offset
5343 = aligned_upper_bound (varargs_and_saved_regs_size
5344 + get_frame_size (),
5345 STACK_BOUNDARY / BITS_PER_UNIT);
5346
5347 /* Both these values are already aligned. */
5348 gcc_assert (multiple_p (crtl->outgoing_args_size,
5349 STACK_BOUNDARY / BITS_PER_UNIT));
5350 cfun->machine->frame.frame_size
5351 = (cfun->machine->frame.hard_fp_offset
5352 + crtl->outgoing_args_size);
5353
5354 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5355
5356 cfun->machine->frame.initial_adjust = 0;
5357 cfun->machine->frame.final_adjust = 0;
5358 cfun->machine->frame.callee_adjust = 0;
5359 cfun->machine->frame.callee_offset = 0;
5360
5361 HOST_WIDE_INT max_push_offset = 0;
5362 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5363 max_push_offset = 512;
5364 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5365 max_push_offset = 256;
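/* Descriptive note (an assumption about the rationale, not from the original
   comments): 512 matches the maximum pre-index writeback offset of an STP of
   two X registers, and 256 that of a single pre-indexed STR.  */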
5366
5367 HOST_WIDE_INT const_size, const_fp_offset;
5368 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5369 && const_size < max_push_offset
5370 && known_eq (crtl->outgoing_args_size, 0))
5371 {
5372 /* Simple, small frame with no outgoing arguments:
5373 stp reg1, reg2, [sp, -frame_size]!
5374 stp reg3, reg4, [sp, 16] */
5375 cfun->machine->frame.callee_adjust = const_size;
5376 }
5377 else if (known_lt (crtl->outgoing_args_size
5378 + cfun->machine->frame.saved_regs_size, 512)
5379 && !(cfun->calls_alloca
5380 && known_lt (cfun->machine->frame.hard_fp_offset,
5381 max_push_offset)))
5382 {
5383 /* Frame with small outgoing arguments:
5384 sub sp, sp, frame_size
5385 stp reg1, reg2, [sp, outgoing_args_size]
5386 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5387 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5388 cfun->machine->frame.callee_offset
5389 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5390 }
5391 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5392 && const_fp_offset < max_push_offset)
5393 {
5394 /* Frame with large outgoing arguments but a small local area:
5395 stp reg1, reg2, [sp, -hard_fp_offset]!
5396 stp reg3, reg4, [sp, 16]
5397 sub sp, sp, outgoing_args_size */
5398 cfun->machine->frame.callee_adjust = const_fp_offset;
5399 cfun->machine->frame.final_adjust
5400 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5401 }
5402 else
5403 {
5404 /* Frame with large local area and outgoing arguments using frame pointer:
5405 sub sp, sp, hard_fp_offset
5406 stp x29, x30, [sp, 0]
5407 add x29, sp, 0
5408 stp reg3, reg4, [sp, 16]
5409 sub sp, sp, outgoing_args_size */
5410 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5411 cfun->machine->frame.final_adjust
5412 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5413 }
5414
5415 cfun->machine->frame.laid_out = true;
5416 }
5417
5418 /* Return true if the register REGNO is saved on entry to
5419 the current function. */
5420
5421 static bool
5422 aarch64_register_saved_on_entry (int regno)
5423 {
5424 return cfun->machine->frame.reg_offset[regno] >= 0;
5425 }
5426
5427 /* Return the next register, from REGNO up to LIMIT, that the callee
5428 needs to save. */
5429
5430 static unsigned
5431 aarch64_next_callee_save (unsigned regno, unsigned limit)
5432 {
5433 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5434 regno ++;
5435 return regno;
5436 }
5437
5438 /* Push the register number REGNO of mode MODE to the stack with write-back
5439 adjusting the stack by ADJUSTMENT. */
5440
5441 static void
5442 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5443 HOST_WIDE_INT adjustment)
5444 {
5445 rtx base_rtx = stack_pointer_rtx;
5446 rtx insn, reg, mem;
5447
5448 reg = gen_rtx_REG (mode, regno);
5449 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5450 plus_constant (Pmode, base_rtx, -adjustment));
5451 mem = gen_frame_mem (mode, mem);
5452
5453 insn = emit_move_insn (mem, reg);
5454 RTX_FRAME_RELATED_P (insn) = 1;
5455 }
5456
5457 /* Generate and return an instruction to store the pair of registers
5458 REG and REG2 of mode MODE to location BASE with write-back adjusting
5459 the stack location BASE by ADJUSTMENT. */
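/* Illustrative note (an assumption about the patterns used below, not taken
   from the original comments): for DImode this corresponds to a pre-indexed
   store pair of the form "stp reg, reg2, [base, -adjustment]!".  */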
5460
5461 static rtx
5462 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5463 HOST_WIDE_INT adjustment)
5464 {
5465 switch (mode)
5466 {
5467 case E_DImode:
5468 return gen_storewb_pairdi_di (base, base, reg, reg2,
5469 GEN_INT (-adjustment),
5470 GEN_INT (UNITS_PER_WORD - adjustment));
5471 case E_DFmode:
5472 return gen_storewb_pairdf_di (base, base, reg, reg2,
5473 GEN_INT (-adjustment),
5474 GEN_INT (UNITS_PER_WORD - adjustment));
5475 case E_TFmode:
5476 return gen_storewb_pairtf_di (base, base, reg, reg2,
5477 GEN_INT (-adjustment),
5478 GEN_INT (UNITS_PER_VREG - adjustment));
5479 default:
5480 gcc_unreachable ();
5481 }
5482 }
5483
5484 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5485 stack pointer by ADJUSTMENT. */
5486
5487 static void
5488 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5489 {
5490 rtx_insn *insn;
5491 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5492
5493 if (regno2 == INVALID_REGNUM)
5494 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5495
5496 rtx reg1 = gen_rtx_REG (mode, regno1);
5497 rtx reg2 = gen_rtx_REG (mode, regno2);
5498
5499 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5500 reg2, adjustment));
5501 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5502 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5503 RTX_FRAME_RELATED_P (insn) = 1;
5504 }
5505
5506 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5507 adjusting it by ADJUSTMENT afterwards. */
5508
5509 static rtx
5510 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5511 HOST_WIDE_INT adjustment)
5512 {
5513 switch (mode)
5514 {
5515 case E_DImode:
5516 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5517 GEN_INT (UNITS_PER_WORD));
5518 case E_DFmode:
5519 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5520 GEN_INT (UNITS_PER_WORD));
5521 case E_TFmode:
5522 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5523 GEN_INT (UNITS_PER_VREG));
5524 default:
5525 gcc_unreachable ();
5526 }
5527 }
5528
5529 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5530 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5531 into CFI_OPS. */
5532
5533 static void
5534 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5535 rtx *cfi_ops)
5536 {
5537 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5538 rtx reg1 = gen_rtx_REG (mode, regno1);
5539
5540 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5541
5542 if (regno2 == INVALID_REGNUM)
5543 {
5544 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5545 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5546 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5547 }
5548 else
5549 {
5550 rtx reg2 = gen_rtx_REG (mode, regno2);
5551 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5552 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5553 reg2, adjustment));
5554 }
5555 }
5556
5557 /* Generate and return a store pair instruction of mode MODE to store
5558 register REG1 to MEM1 and register REG2 to MEM2. */
5559
5560 static rtx
5561 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5562 rtx reg2)
5563 {
5564 switch (mode)
5565 {
5566 case E_DImode:
5567 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5568
5569 case E_DFmode:
5570 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5571
5572 case E_TFmode:
5573 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5574
5575 default:
5576 gcc_unreachable ();
5577 }
5578 }
5579
5580 /* Generate and return a load pair instruction of mode MODE to load register
5581 REG1 from MEM1 and register REG2 from MEM2. */
5582
5583 static rtx
5584 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5585 rtx mem2)
5586 {
5587 switch (mode)
5588 {
5589 case E_DImode:
5590 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5591
5592 case E_DFmode:
5593 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5594
5595 case E_TFmode:
5596 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5597
5598 default:
5599 gcc_unreachable ();
5600 }
5601 }
5602
5603 /* Return TRUE if return address signing should be enabled for the current
5604 function, otherwise return FALSE. */
5605
5606 bool
5607 aarch64_return_address_signing_enabled (void)
5608 {
5609 /* This function should only be called after the frame has been laid out. */
5610 gcc_assert (cfun->machine->frame.laid_out);
5611
5612 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5613 if its LR is pushed onto the stack. */
5614 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5615 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5616 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5617 }
5618
5619 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5620 bool
5621 aarch64_bti_enabled (void)
5622 {
5623 return (aarch64_enable_bti == 1);
5624 }
5625
5626 /* Emit code to save the callee-saved registers from register number START
5627 to LIMIT to the stack at the location starting at offset START_OFFSET,
5628 skipping any write-back candidates if SKIP_WB is true. */
5629
5630 static void
5631 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5632 unsigned start, unsigned limit, bool skip_wb)
5633 {
5634 rtx_insn *insn;
5635 unsigned regno;
5636 unsigned regno2;
5637
5638 for (regno = aarch64_next_callee_save (start, limit);
5639 regno <= limit;
5640 regno = aarch64_next_callee_save (regno + 1, limit))
5641 {
5642 rtx reg, mem;
5643 poly_int64 offset;
5644 int offset_diff;
5645
5646 if (skip_wb
5647 && (regno == cfun->machine->frame.wb_candidate1
5648 || regno == cfun->machine->frame.wb_candidate2))
5649 continue;
5650
5651 if (cfun->machine->reg_is_wrapped_separately[regno])
5652 continue;
5653
5654 reg = gen_rtx_REG (mode, regno);
5655 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5656 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5657 offset));
5658
5659 regno2 = aarch64_next_callee_save (regno + 1, limit);
5660 offset_diff = cfun->machine->frame.reg_offset[regno2]
5661 - cfun->machine->frame.reg_offset[regno];
5662
5663 if (regno2 <= limit
5664 && !cfun->machine->reg_is_wrapped_separately[regno2]
5665 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5666 {
5667 rtx reg2 = gen_rtx_REG (mode, regno2);
5668 rtx mem2;
5669
5670 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5671 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5672 offset));
5673 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5674 reg2));
5675
5676 /* The first part of a frame-related parallel insn is
5677 always assumed to be relevant to the frame
5678 calculations; subsequent parts are only
5679 frame-related if explicitly marked. */
5680 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5681 regno = regno2;
5682 }
5683 else
5684 insn = emit_move_insn (mem, reg);
5685
5686 RTX_FRAME_RELATED_P (insn) = 1;
5687 }
5688 }
5689
5690 /* Emit code to restore the callee-saved registers of mode MODE from register
5691 number START up to and including LIMIT. Restore from the stack offset
5692 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5693 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5694
5695 static void
5696 aarch64_restore_callee_saves (machine_mode mode,
5697 poly_int64 start_offset, unsigned start,
5698 unsigned limit, bool skip_wb, rtx *cfi_ops)
5699 {
5700 rtx base_rtx = stack_pointer_rtx;
5701 unsigned regno;
5702 unsigned regno2;
5703 poly_int64 offset;
5704
5705 for (regno = aarch64_next_callee_save (start, limit);
5706 regno <= limit;
5707 regno = aarch64_next_callee_save (regno + 1, limit))
5708 {
5709 if (cfun->machine->reg_is_wrapped_separately[regno])
5710 continue;
5711
5712 rtx reg, mem;
5713 int offset_diff;
5714
5715 if (skip_wb
5716 && (regno == cfun->machine->frame.wb_candidate1
5717 || regno == cfun->machine->frame.wb_candidate2))
5718 continue;
5719
5720 reg = gen_rtx_REG (mode, regno);
5721 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5722 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5723
5724 regno2 = aarch64_next_callee_save (regno + 1, limit);
5725 offset_diff = cfun->machine->frame.reg_offset[regno2]
5726 - cfun->machine->frame.reg_offset[regno];
5727
5728 if (regno2 <= limit
5729 && !cfun->machine->reg_is_wrapped_separately[regno2]
5730 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5731 {
5732 rtx reg2 = gen_rtx_REG (mode, regno2);
5733 rtx mem2;
5734
5735 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5736 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5737 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5738
5739 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5740 regno = regno2;
5741 }
5742 else
5743 emit_move_insn (reg, mem);
5744 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5745 }
5746 }
5747
5748 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5749 of MODE. */
5750
5751 static inline bool
5752 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5753 {
5754 HOST_WIDE_INT multiple;
5755 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5756 && IN_RANGE (multiple, -8, 7));
5757 }
5758
5759 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5760 of MODE. */
5761
5762 static inline bool
5763 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5764 {
5765 HOST_WIDE_INT multiple;
5766 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5767 && IN_RANGE (multiple, 0, 63));
5768 }
5769
5770 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5771 of MODE. */
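/* For example (illustrative, assuming DImode): constant offsets that are
   multiples of 8 in the range [-512, 504] satisfy this predicate, which
   matches the immediate range of LDP/STP.  */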
5772
5773 bool
5774 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5775 {
5776 HOST_WIDE_INT multiple;
5777 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5778 && IN_RANGE (multiple, -64, 63));
5779 }
5780
5781 /* Return true if OFFSET is a signed 9-bit value. */
5782
5783 bool
5784 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5785 poly_int64 offset)
5786 {
5787 HOST_WIDE_INT const_offset;
5788 return (offset.is_constant (&const_offset)
5789 && IN_RANGE (const_offset, -256, 255));
5790 }
5791
5792 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5793 of MODE. */
5794
5795 static inline bool
5796 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5797 {
5798 HOST_WIDE_INT multiple;
5799 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5800 && IN_RANGE (multiple, -256, 255));
5801 }
5802
5803 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5804 of MODE. */
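/* For example (illustrative, assuming DImode): constant offsets that are
   multiples of 8 in the range [0, 32760] satisfy this predicate, which
   matches the unsigned-offset form of LDR/STR.  */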
5805
5806 static inline bool
5807 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5808 {
5809 HOST_WIDE_INT multiple;
5810 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5811 && IN_RANGE (multiple, 0, 4095));
5812 }
5813
5814 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5815
5816 static sbitmap
5817 aarch64_get_separate_components (void)
5818 {
5819 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5820 bitmap_clear (components);
5821
5822 /* The registers we need saved to the frame. */
5823 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5824 if (aarch64_register_saved_on_entry (regno))
5825 {
5826 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5827 if (!frame_pointer_needed)
5828 offset += cfun->machine->frame.frame_size
5829 - cfun->machine->frame.hard_fp_offset;
5830 /* Check that we can access the stack slot of the register with one
5831 direct load with no adjustments needed. */
5832 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5833 bitmap_set_bit (components, regno);
5834 }
5835
5836 /* Don't mess with the hard frame pointer. */
5837 if (frame_pointer_needed)
5838 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5839
5840 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5841 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5842 /* If registers have been chosen to be stored/restored with
5843 writeback, don't interfere with them to avoid having to output explicit
5844 stack adjustment instructions. */
5845 if (reg2 != INVALID_REGNUM)
5846 bitmap_clear_bit (components, reg2);
5847 if (reg1 != INVALID_REGNUM)
5848 bitmap_clear_bit (components, reg1);
5849
5850 bitmap_clear_bit (components, LR_REGNUM);
5851 bitmap_clear_bit (components, SP_REGNUM);
5852
5853 return components;
5854 }
5855
5856 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5857
5858 static sbitmap
5859 aarch64_components_for_bb (basic_block bb)
5860 {
5861 bitmap in = DF_LIVE_IN (bb);
5862 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5863 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5864 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5865
5866 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5867 bitmap_clear (components);
5868
5869 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5870 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5871 if ((!call_used_regs[regno]
5872 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5873 && (bitmap_bit_p (in, regno)
5874 || bitmap_bit_p (gen, regno)
5875 || bitmap_bit_p (kill, regno)))
5876 {
5877 unsigned regno2, offset, offset2;
5878 bitmap_set_bit (components, regno);
5879
5880 /* If there is a callee-save at an adjacent offset, add it as well
5881 to increase the use of LDP/STP. */
5882 offset = cfun->machine->frame.reg_offset[regno];
5883 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5884
5885 if (regno2 <= LAST_SAVED_REGNUM)
5886 {
5887 offset2 = cfun->machine->frame.reg_offset[regno2];
5888 if ((offset & ~8) == (offset2 & ~8))
5889 bitmap_set_bit (components, regno2);
5890 }
5891 }
5892
5893 return components;
5894 }
5895
5896 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5897 Nothing to do for aarch64. */
5898
5899 static void
5900 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5901 {
5902 }
5903
5904 /* Return the next set bit in BMP from START onwards. Return the total number
5905 of bits in BMP if no set bit is found at or after START. */
5906
5907 static unsigned int
5908 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5909 {
5910 unsigned int nbits = SBITMAP_SIZE (bmp);
5911 if (start == nbits)
5912 return start;
5913
5914 gcc_assert (start < nbits);
5915 for (unsigned int i = start; i < nbits; i++)
5916 if (bitmap_bit_p (bmp, i))
5917 return i;
5918
5919 return nbits;
5920 }
5921
5922 /* Do the work for aarch64_emit_prologue_components and
5923 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5924 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5925 for these components or the epilogue sequence. That is, it determines
5926 whether we should emit stores or loads and what kind of CFA notes to attach
5927 to the insns. Otherwise the logic for the two sequences is very
5928 similar. */
5929
5930 static void
5931 aarch64_process_components (sbitmap components, bool prologue_p)
5932 {
5933 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5934 ? HARD_FRAME_POINTER_REGNUM
5935 : STACK_POINTER_REGNUM);
5936
5937 unsigned last_regno = SBITMAP_SIZE (components);
5938 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5939 rtx_insn *insn = NULL;
5940
5941 while (regno != last_regno)
5942 {
5943 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5944 so DFmode for the vector registers is enough. For simd functions
5945 we want to save the low 128 bits. */
5946 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5947
5948 rtx reg = gen_rtx_REG (mode, regno);
5949 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5950 if (!frame_pointer_needed)
5951 offset += cfun->machine->frame.frame_size
5952 - cfun->machine->frame.hard_fp_offset;
5953 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5954 rtx mem = gen_frame_mem (mode, addr);
5955
5956 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5957 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5958 /* No more registers to handle after REGNO.
5959 Emit a single save/restore and exit. */
5960 if (regno2 == last_regno)
5961 {
5962 insn = emit_insn (set);
5963 RTX_FRAME_RELATED_P (insn) = 1;
5964 if (prologue_p)
5965 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5966 else
5967 add_reg_note (insn, REG_CFA_RESTORE, reg);
5968 break;
5969 }
5970
5971 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5972 /* The next register is not of the same class or its offset is not
5973 mergeable with the current one into a pair. */
5974 if (!satisfies_constraint_Ump (mem)
5975 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5976 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5977 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5978 GET_MODE_SIZE (mode)))
5979 {
5980 insn = emit_insn (set);
5981 RTX_FRAME_RELATED_P (insn) = 1;
5982 if (prologue_p)
5983 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5984 else
5985 add_reg_note (insn, REG_CFA_RESTORE, reg);
5986
5987 regno = regno2;
5988 continue;
5989 }
5990
5991 /* REGNO2 can be saved/restored in a pair with REGNO. */
5992 rtx reg2 = gen_rtx_REG (mode, regno2);
5993 if (!frame_pointer_needed)
5994 offset2 += cfun->machine->frame.frame_size
5995 - cfun->machine->frame.hard_fp_offset;
5996 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5997 rtx mem2 = gen_frame_mem (mode, addr2);
5998 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5999 : gen_rtx_SET (reg2, mem2);
6000
6001 if (prologue_p)
6002 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6003 else
6004 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6005
6006 RTX_FRAME_RELATED_P (insn) = 1;
6007 if (prologue_p)
6008 {
6009 add_reg_note (insn, REG_CFA_OFFSET, set);
6010 add_reg_note (insn, REG_CFA_OFFSET, set2);
6011 }
6012 else
6013 {
6014 add_reg_note (insn, REG_CFA_RESTORE, reg);
6015 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6016 }
6017
6018 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6019 }
6020 }
6021
6022 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6023
6024 static void
6025 aarch64_emit_prologue_components (sbitmap components)
6026 {
6027 aarch64_process_components (components, true);
6028 }
6029
6030 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6031
6032 static void
6033 aarch64_emit_epilogue_components (sbitmap components)
6034 {
6035 aarch64_process_components (components, false);
6036 }
6037
6038 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6039
6040 static void
6041 aarch64_set_handled_components (sbitmap components)
6042 {
6043 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6044 if (bitmap_bit_p (components, regno))
6045 cfun->machine->reg_is_wrapped_separately[regno] = true;
6046 }
6047
6048 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6049 determine the probe offset for alloca. */
6050
6051 static HOST_WIDE_INT
6052 aarch64_stack_clash_protection_alloca_probe_range (void)
6053 {
6054 return STACK_CLASH_CALLER_GUARD;
6055 }
6056
6057
6058 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6059 registers. If POLY_SIZE is not large enough to require a probe, this function
6060 will only adjust the stack. When allocating the stack space,
6061 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
6062 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6063 arguments. If we are, we ensure that any allocation larger than the
6064 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer
6065 is maintained.
6066
6067 We emit barriers after each stack adjustment to prevent optimizations from
6068 breaking the invariant that we never drop the stack by more than a page.
6069 This invariant makes it easier to handle asynchronous events correctly:
6070 if we allowed the stack to be dropped by more than a page before the
6071 corresponding probes, and a signal were taken somewhere in between, the
6072 signal handler would not know the state of the stack and could make no
6073 assumptions about which pages have been probed. */
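/* Illustrative example (derived from the description above, assuming the
   default 64KB guard and the 1KB caller buffer): the initial adjustment is
   only probed once it reaches 64KB - 1KB = 63KB, whereas the final
   (outgoing-argument) adjustment is probed once it exceeds the 1KB buffer.  */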
6074
6075 static void
6076 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6077 poly_int64 poly_size,
6078 bool frame_related_p,
6079 bool final_adjustment_p)
6080 {
6081 HOST_WIDE_INT guard_size
6082 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6083 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6084 /* When doing the final adjustment for the outgoing argument size, we can't
6085 assume that LR was saved at position 0. So subtract its offset from the
6086 ABI safe buffer so that we don't accidentally allow an adjustment that
6087 would result in an allocation larger than the ABI buffer without
6088 probing. */
6089 HOST_WIDE_INT min_probe_threshold
6090 = final_adjustment_p
6091 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6092 : guard_size - guard_used_by_caller;
6093
6094 poly_int64 frame_size = cfun->machine->frame.frame_size;
6095
6096 /* We should always have a positive probe threshold. */
6097 gcc_assert (min_probe_threshold > 0);
6098
6099 if (flag_stack_clash_protection && !final_adjustment_p)
6100 {
6101 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6102 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6103
6104 if (known_eq (frame_size, 0))
6105 {
6106 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6107 }
6108 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6109 && known_lt (final_adjust, guard_used_by_caller))
6110 {
6111 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6112 }
6113 }
6114
6115 /* If SIZE is not large enough to require probing, just adjust the stack and
6116 exit. */
6117 if (known_lt (poly_size, min_probe_threshold)
6118 || !flag_stack_clash_protection)
6119 {
6120 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6121 return;
6122 }
6123
6124 HOST_WIDE_INT size;
6125 /* Handle the SVE non-constant case first. */
6126 if (!poly_size.is_constant (&size))
6127 {
6128 if (dump_file)
6129 {
6130 fprintf (dump_file, "Stack clash SVE prologue: ");
6131 print_dec (poly_size, dump_file);
6132 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6133 }
6134
6135 /* First calculate the number of bytes we're actually spilling. */
6136 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6137 poly_size, temp1, temp2, false, true);
6138
6139 rtx_insn *insn = get_last_insn ();
6140
6141 if (frame_related_p)
6142 {
6143 /* This is done to provide unwinding information for the stack
6144 adjustments we're about to do; however, to prevent the optimizers
6145 from removing the R11 move and leaving the CFA note (which would be
6146 very wrong), we tie the old and new stack pointers together.
6147 The tie will expand to nothing, but the optimizers will not touch
6148 the instruction. */
6149 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6150 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6151 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6152
6153 /* We want the CFA independent of the stack pointer for the
6154 duration of the loop. */
6155 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6156 RTX_FRAME_RELATED_P (insn) = 1;
6157 }
6158
6159 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6160 rtx guard_const = gen_int_mode (guard_size, Pmode);
6161
6162 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6163 stack_pointer_rtx, temp1,
6164 probe_const, guard_const));
6165
6166 /* Now reset the CFA register if needed. */
6167 if (frame_related_p)
6168 {
6169 add_reg_note (insn, REG_CFA_DEF_CFA,
6170 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6171 gen_int_mode (poly_size, Pmode)));
6172 RTX_FRAME_RELATED_P (insn) = 1;
6173 }
6174
6175 return;
6176 }
6177
6178 if (dump_file)
6179 fprintf (dump_file,
6180 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6181 " bytes, probing will be required.\n", size);
6182
6183 /* Round size down to a multiple of guard_size, and calculate the
6184 residual as the difference between the original size and the rounded
6185 size. */
6186 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6187 HOST_WIDE_INT residual = size - rounded_size;
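/* Worked example (illustrative): with a 64KB (65536-byte) guard, a request
   of 150000 bytes gives rounded_size = 131072 and residual = 18928.  */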
6188
6189 /* We can handle a small number of allocations/probes inline. Otherwise
6190 punt to a loop. */
6191 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6192 {
6193 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6194 {
6195 aarch64_sub_sp (NULL, temp2, guard_size, true);
6196 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6197 guard_used_by_caller));
6198 emit_insn (gen_blockage ());
6199 }
6200 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6201 }
6202 else
6203 {
6204 /* Compute the ending address. */
6205 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6206 temp1, NULL, false, true);
6207 rtx_insn *insn = get_last_insn ();
6208
6209 /* For the initial allocation, we don't have a frame pointer
6210 set up, so we always need CFI notes. If we're doing the
6211 final allocation, then we may have a frame pointer, in which
6212 case it is the CFA, otherwise we need CFI notes.
6213
6214 We can determine which allocation we are doing by looking at
6215 the value of FRAME_RELATED_P since the final allocations are not
6216 frame related. */
6217 if (frame_related_p)
6218 {
6219 /* We want the CFA independent of the stack pointer for the
6220 duration of the loop. */
6221 add_reg_note (insn, REG_CFA_DEF_CFA,
6222 plus_constant (Pmode, temp1, rounded_size));
6223 RTX_FRAME_RELATED_P (insn) = 1;
6224 }
6225
6226 /* This allocates and probes the stack. Note that this re-uses some of
6227 the existing Ada stack protection code. However, we are guaranteed not
6228 to enter the non-loop or residual branches of that code.
6229
6230 The non-loop part won't be entered because if our allocation amount
6231 doesn't require a loop, the case above would handle it.
6232
6233 The residual amount won't be entered because TEMP1 is a multiple of
6234 the allocation size. The residual will always be 0. As such, the only
6235 part we are actually using from that code is the loop setup. The
6236 actual probing is done in aarch64_output_probe_stack_range. */
6237 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6238 stack_pointer_rtx, temp1));
6239
6240 /* Now reset the CFA register if needed. */
6241 if (frame_related_p)
6242 {
6243 add_reg_note (insn, REG_CFA_DEF_CFA,
6244 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6245 RTX_FRAME_RELATED_P (insn) = 1;
6246 }
6247
6248 emit_insn (gen_blockage ());
6249 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6250 }
6251
6252 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6253 be probed. This maintains the requirement that each page is probed at
6254 least once. For the initial probing we probe only if the allocation is
6255 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6256 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6257 GUARD_SIZE, so for any allocation that is large enough to trigger a
6258 probe here we'll have at least one; and if an allocation is not large
6259 enough for this code to emit anything for it, the page will already have
6260 been probed by the save of FP/LR, either in this function or in one of
6261 its callees. If we don't have any callees then we won't have any more
6262 stack adjustments and so are still safe. */
6263 if (residual)
6264 {
6265 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6266 /* If we're doing final adjustments, and we've done any full page
6267 allocations then any residual needs to be probed. */
6268 if (final_adjustment_p && rounded_size != 0)
6269 min_probe_threshold = 0;
6270 /* If doing a small final adjustment, we always probe at offset 0.
6271 This is done to avoid issues when LR is not at position 0 or when
6272 the final adjustment is smaller than the probing offset. */
6273 else if (final_adjustment_p && rounded_size == 0)
6274 residual_probe_offset = 0;
6275
6276 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6277 if (residual >= min_probe_threshold)
6278 {
6279 if (dump_file)
6280 fprintf (dump_file,
6281 "Stack clash AArch64 prologue residuals: "
6282 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6283 "\n", residual);
6284
6285 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6286 residual_probe_offset));
6287 emit_insn (gen_blockage ());
6288 }
6289 }
6290 }
6291
6292 /* Return 1 if the register is used by the epilogue. We need to say the
6293 return register is used, but only after epilogue generation is complete.
6294 Note that in the case of sibcalls, the values "used by the epilogue" are
6295 considered live at the start of the called function.
6296
6297 For SIMD functions we need to return 1 for FP registers that are saved and
6298 restored by a function but are not zero in call_used_regs. If we do not do
6299 this, optimizations may remove the restore of the register. */
6300
6301 int
6302 aarch64_epilogue_uses (int regno)
6303 {
6304 if (epilogue_completed)
6305 {
6306 if (regno == LR_REGNUM)
6307 return 1;
6308 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6309 return 1;
6310 }
6311 return 0;
6312 }
6313
6314 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6315 is saved at BASE + OFFSET. */
6316
6317 static void
6318 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6319 rtx base, poly_int64 offset)
6320 {
6321 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6322 add_reg_note (insn, REG_CFA_EXPRESSION,
6323 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6324 }
6325
6326 /* AArch64 stack frames generated by this compiler look like:
6327
6328 +-------------------------------+
6329 | |
6330 | incoming stack arguments |
6331 | |
6332 +-------------------------------+
6333 | | <-- incoming stack pointer (aligned)
6334 | callee-allocated save area |
6335 | for register varargs |
6336 | |
6337 +-------------------------------+
6338 | local variables | <-- frame_pointer_rtx
6339 | |
6340 +-------------------------------+
6341 | padding | \
6342 +-------------------------------+ |
6343 | callee-saved registers | | frame.saved_regs_size
6344 +-------------------------------+ |
6345 | LR' | |
6346 +-------------------------------+ |
6347 | FP' | / <- hard_frame_pointer_rtx (aligned)
6348 +-------------------------------+
6349 | dynamic allocation |
6350 +-------------------------------+
6351 | padding |
6352 +-------------------------------+
6353 | outgoing stack arguments | <-- arg_pointer
6354 | |
6355 +-------------------------------+
6356 | | <-- stack_pointer_rtx (aligned)
6357
6358 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6359 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6360 unchanged.
6361
6362 By default for stack-clash we assume the guard is at least 64KB, but this
6363 value is configurable to either 4KB or 64KB. We also force the guard size to
6364 be the same as the probing interval and both values are kept in sync.
6365
6366 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6367 on the guard size) of stack space without probing.
6368
6369 When probing is needed, we emit a probe at the start of the prologue
6370 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6371
6372 We have to track how much space has been allocated, and the only stores
6373 to the stack that we track as implicit probes are the FP/LR stores.
6374
6375 For outgoing arguments we probe if the size is larger than 1KB, such that
6376 the ABI specified buffer is maintained for the next callee.
6377
6378 The following registers are reserved during frame layout and should not be
6379 used for any other purpose:
6380
6381 - r11: Used by stack clash protection when SVE is enabled.
6382 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6383 - r14 and r15: Used for speculation tracking.
6384 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6385 - r30(LR), r29(FP): Used by standard frame layout.
6386
6387 These registers must be avoided in frame layout related code unless the
6388 explicit intention is to interact with one of the features listed above. */
6389
6390 /* Generate the prologue instructions for entry into a function.
6391 Establish the stack frame by decreasing the stack pointer with a
6392 properly calculated size and, if necessary, create a frame record
6393 filled with the values of LR and previous frame pointer. The
6394 current FP is also set up if it is in use. */
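/* A minimal sketch of the kind of code this emits (illustrative only, for a
   small constant frame that uses a frame chain and one extra callee save):

	stp	x29, x30, [sp, -32]!
	mov	x29, sp
	str	x19, [sp, 16]

   The epilogue below reverses such a sequence.  */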
6395
6396 void
6397 aarch64_expand_prologue (void)
6398 {
6399 poly_int64 frame_size = cfun->machine->frame.frame_size;
6400 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6401 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6402 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6403 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6404 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6405 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6406 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6407 rtx_insn *insn;
6408
6409 /* Sign return address for functions. */
6410 if (aarch64_return_address_signing_enabled ())
6411 {
6412 switch (aarch64_ra_sign_key)
6413 {
6414 case AARCH64_KEY_A:
6415 insn = emit_insn (gen_paciasp ());
6416 break;
6417 case AARCH64_KEY_B:
6418 insn = emit_insn (gen_pacibsp ());
6419 break;
6420 default:
6421 gcc_unreachable ();
6422 }
6423 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6424 RTX_FRAME_RELATED_P (insn) = 1;
6425 }
6426
6427 if (flag_stack_usage_info)
6428 current_function_static_stack_size = constant_lower_bound (frame_size);
6429
6430 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6431 {
6432 if (crtl->is_leaf && !cfun->calls_alloca)
6433 {
6434 if (maybe_gt (frame_size, PROBE_INTERVAL)
6435 && maybe_gt (frame_size, get_stack_check_protect ()))
6436 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6437 (frame_size
6438 - get_stack_check_protect ()));
6439 }
6440 else if (maybe_gt (frame_size, 0))
6441 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6442 }
6443
6444 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6445 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6446
6447 /* In theory we should never have both an initial adjustment
6448 and a callee save adjustment. Verify that is the case since the
6449 code below does not handle it for -fstack-clash-protection. */
6450 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6451
6452 /* Will only probe if the initial adjustment is larger than the guard
6453 less the amount of the guard reserved for use by the caller's
6454 outgoing args. */
6455 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6456 true, false);
6457
6458 if (callee_adjust != 0)
6459 aarch64_push_regs (reg1, reg2, callee_adjust);
6460
6461 if (emit_frame_chain)
6462 {
6463 poly_int64 reg_offset = callee_adjust;
6464 if (callee_adjust == 0)
6465 {
6466 reg1 = R29_REGNUM;
6467 reg2 = R30_REGNUM;
6468 reg_offset = callee_offset;
6469 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6470 }
6471 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6472 stack_pointer_rtx, callee_offset,
6473 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6474 if (frame_pointer_needed && !frame_size.is_constant ())
6475 {
6476 /* Variable-sized frames need to describe the save slot
6477 address using DW_CFA_expression rather than DW_CFA_offset.
6478 This means that, without taking further action, the
6479 locations of the registers that we've already saved would
6480 remain based on the stack pointer even after we redefine
6481 the CFA based on the frame pointer. We therefore need new
6482 DW_CFA_expressions to re-express the save slots with addresses
6483 based on the frame pointer. */
6484 rtx_insn *insn = get_last_insn ();
6485 gcc_assert (RTX_FRAME_RELATED_P (insn));
6486
6487 /* Add an explicit CFA definition if this was previously
6488 implicit. */
6489 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6490 {
6491 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6492 callee_offset);
6493 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6494 gen_rtx_SET (hard_frame_pointer_rtx, src));
6495 }
6496
6497 /* Change the save slot expressions for the registers that
6498 we've already saved. */
6499 reg_offset -= callee_offset;
6500 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6501 reg_offset + UNITS_PER_WORD);
6502 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6503 reg_offset);
6504 }
6505 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6506 }
6507
6508 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6509 callee_adjust != 0 || emit_frame_chain);
6510 if (aarch64_simd_decl_p (cfun->decl))
6511 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6512 callee_adjust != 0 || emit_frame_chain);
6513 else
6514 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6515 callee_adjust != 0 || emit_frame_chain);
6516
6517 /* We may need to probe the final adjustment if it is larger than the guard
6518 that is assumed by the called function. */
6519 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6520 !frame_pointer_needed, true);
6521 }
6522
6523 /* Return TRUE if we can use a simple_return insn.
6524
6525 This function checks whether the callee saved stack is empty, which
6526 means no restore actions are needed. The pro_and_epilogue pass will use
6527 this to check whether the shrink-wrapping optimization is feasible. */
6528
6529 bool
6530 aarch64_use_return_insn_p (void)
6531 {
6532 if (!reload_completed)
6533 return false;
6534
6535 if (crtl->profile)
6536 return false;
6537
6538 return known_eq (cfun->machine->frame.frame_size, 0);
6539 }
6540
6541 /* Return false for non-leaf SIMD functions in order to avoid
6542 shrink-wrapping them, since doing so would lose the necessary
6543 save/restore of FP registers. */
6544
6545 bool
6546 aarch64_use_simple_return_insn_p (void)
6547 {
6548 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6549 return false;
6550
6551 return true;
6552 }
6553
6554 /* Generate the epilogue instructions for returning from a function.
6555 This is almost exactly the reverse of the prolog sequence, except
6556 that we need to insert barriers to avoid scheduling loads that read
6557 from a deallocated stack, and we optimize the unwind records by
6558 emitting them all together if possible. */
6559 void
6560 aarch64_expand_epilogue (bool for_sibcall)
6561 {
6562 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6563 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6564 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6565 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6566 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6567 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6568 rtx cfi_ops = NULL;
6569 rtx_insn *insn;
6570 /* A stack clash protection prologue may not have left EP0_REGNUM or
6571 EP1_REGNUM in a usable state. The same is true for allocations
6572 with an SVE component, since we then need both temporary registers
6573 for each allocation. For stack clash we are in a usable state if
6574 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6575 HOST_WIDE_INT guard_size
6576 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6577 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6578
6579 /* We can re-use the registers when the allocation amount is smaller than
6580 guard_size - guard_used_by_caller because we won't be doing any probes
6581 then. In such situations the register should remain live with the correct
6582 value. */
6583 bool can_inherit_p = (initial_adjust.is_constant ()
6584 && final_adjust.is_constant ())
6585 && (!flag_stack_clash_protection
6586 || known_lt (initial_adjust,
6587 guard_size - guard_used_by_caller));
6588
6589 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6590 bool need_barrier_p
6591 = maybe_ne (get_frame_size ()
6592 + cfun->machine->frame.saved_varargs_size, 0);
6593
6594 /* Emit a barrier to prevent loads from a deallocated stack. */
6595 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6596 || cfun->calls_alloca
6597 || crtl->calls_eh_return)
6598 {
6599 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6600 need_barrier_p = false;
6601 }
6602
6603 /* Restore the stack pointer from the frame pointer if it may not
6604 be the same as the stack pointer. */
6605 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6606 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6607 if (frame_pointer_needed
6608 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6609 /* If writeback is used when restoring callee-saves, the CFA
6610 is restored on the instruction doing the writeback. */
6611 aarch64_add_offset (Pmode, stack_pointer_rtx,
6612 hard_frame_pointer_rtx, -callee_offset,
6613 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6614 else
6615 /* The case where we need to re-use the register here is very rare, so
6616 avoid the complicated condition and just always emit a move if the
6617 immediate doesn't fit. */
6618 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6619
6620 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6621 callee_adjust != 0, &cfi_ops);
6622 if (aarch64_simd_decl_p (cfun->decl))
6623 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6624 callee_adjust != 0, &cfi_ops);
6625 else
6626 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6627 callee_adjust != 0, &cfi_ops);
6628
6629 if (need_barrier_p)
6630 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6631
6632 if (callee_adjust != 0)
6633 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6634
6635 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6636 {
6637 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6638 insn = get_last_insn ();
6639 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6640 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6641 RTX_FRAME_RELATED_P (insn) = 1;
6642 cfi_ops = NULL;
6643 }
6644
6645 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6646 restrict the emit_move optimization to leaf functions. */
6647 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6648 (!can_inherit_p || !crtl->is_leaf
6649 || df_regs_ever_live_p (EP0_REGNUM)));
6650
6651 if (cfi_ops)
6652 {
6653 /* Emit delayed restores and reset the CFA to be SP. */
6654 insn = get_last_insn ();
6655 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6656 REG_NOTES (insn) = cfi_ops;
6657 RTX_FRAME_RELATED_P (insn) = 1;
6658 }
6659
6660 /* We prefer to emit the combined return/authenticate instruction RETAA;
6661 however, there are three cases in which we must instead emit an explicit
6662 authentication instruction.
6663
6664 1) Sibcalls don't return in a normal way, so if we're about to call one
6665 we must authenticate.
6666
6667 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6668 generating code for !TARGET_ARMV8_3 we can't use it and must
6669 explicitly authenticate.
6670
6671 3) On an eh_return path we make extra stack adjustments to update the
6672 canonical frame address to be the exception handler's CFA. We want
6673 to authenticate using the CFA of the function which calls eh_return.
6674 */
6675 if (aarch64_return_address_signing_enabled ()
6676 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6677 {
6678 switch (aarch64_ra_sign_key)
6679 {
6680 case AARCH64_KEY_A:
6681 insn = emit_insn (gen_autiasp ());
6682 break;
6683 case AARCH64_KEY_B:
6684 insn = emit_insn (gen_autibsp ());
6685 break;
6686 default:
6687 gcc_unreachable ();
6688 }
6689 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6690 RTX_FRAME_RELATED_P (insn) = 1;
6691 }
6692
6693 /* Stack adjustment for exception handler. */
6694 if (crtl->calls_eh_return && !for_sibcall)
6695 {
6696 /* We need to unwind the stack by the offset computed by
6697 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6698 to be SP; letting the CFA move during this adjustment
6699 is just as correct as retaining the CFA from the body
6700 of the function. Therefore, do nothing special. */
6701 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6702 }
6703
6704 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6705 if (!for_sibcall)
6706 emit_jump_insn (ret_rtx);
6707 }
6708
6709 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6710 normally or return to a previous frame after unwinding.
6711
6712 An EH return uses a single shared return sequence. The epilogue is
6713 exactly like a normal epilogue except that it has an extra input
6714 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6715 that must be applied after the frame has been destroyed. An extra label
6716 is inserted before the epilogue which initializes this register to zero,
6717 and this is the entry point for a normal return.
6718
6719 An actual EH return updates the return address, initializes the stack
6720 adjustment and jumps directly into the epilogue (bypassing the zeroing
6721 of the adjustment). Since the return address is typically saved on the
6722 stack when a function makes a call, the saved LR must be updated outside
6723 the epilogue.
6724
6725 This poses problems as the store is generated well before the epilogue,
6726 so the offset of LR is not known yet. Also, optimizations will remove the
6727 store as it appears dead, even after the epilogue is generated (as the
6728 base or offset for loading LR is different in many cases).
6729
6730 To avoid these problems this implementation forces the frame pointer
6731 in eh_return functions so that the location of LR is fixed and known early.
6732 It also marks the store volatile, so no optimization is permitted to
6733 remove the store. */
6734 rtx
6735 aarch64_eh_return_handler_rtx (void)
6736 {
6737 rtx tmp = gen_frame_mem (Pmode,
6738 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6739
6740 /* Mark the store volatile, so no optimization is permitted to remove it. */
6741 MEM_VOLATILE_P (tmp) = true;
6742 return tmp;
6743 }
6744
6745 /* Output code to add DELTA to the first argument, and then jump
6746 to FUNCTION. Used for C++ multiple inheritance. */
6747 static void
6748 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6749 HOST_WIDE_INT delta,
6750 HOST_WIDE_INT vcall_offset,
6751 tree function)
6752 {
6753 /* The this pointer is always in x0. Note that this differs from
6754 Arm where the this pointer may be bumped to r1 if r0 is required
6755 to return a pointer to an aggregate. On AArch64 a result value
6756 pointer will be in x8. */
6757 int this_regno = R0_REGNUM;
6758 rtx this_rtx, temp0, temp1, addr, funexp;
6759 rtx_insn *insn;
6760 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6761
6762 if (aarch64_bti_enabled ())
6763 emit_insn (gen_bti_c ());
6764
6765 reload_completed = 1;
6766 emit_note (NOTE_INSN_PROLOGUE_END);
6767
6768 this_rtx = gen_rtx_REG (Pmode, this_regno);
6769 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6770 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6771
6772 if (vcall_offset == 0)
6773 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6774 else
6775 {
6776 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6777
6778 addr = this_rtx;
6779 if (delta != 0)
6780 {
6781 if (delta >= -256 && delta < 256)
6782 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6783 plus_constant (Pmode, this_rtx, delta));
6784 else
6785 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6786 temp1, temp0, false);
6787 }
6788
6789 if (Pmode == ptr_mode)
6790 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6791 else
6792 aarch64_emit_move (temp0,
6793 gen_rtx_ZERO_EXTEND (Pmode,
6794 gen_rtx_MEM (ptr_mode, addr)));
6795
6796 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6797 addr = plus_constant (Pmode, temp0, vcall_offset);
6798 else
6799 {
6800 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6801 Pmode);
6802 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6803 }
6804
6805 if (Pmode == ptr_mode)
6806 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6807 else
6808 aarch64_emit_move (temp1,
6809 gen_rtx_SIGN_EXTEND (Pmode,
6810 gen_rtx_MEM (ptr_mode, addr)));
6811
6812 emit_insn (gen_add2_insn (this_rtx, temp1));
6813 }
6814
6815 /* Generate a tail call to the target function. */
6816 if (!TREE_USED (function))
6817 {
6818 assemble_external (function);
6819 TREE_USED (function) = 1;
6820 }
6821 funexp = XEXP (DECL_RTL (function), 0);
6822 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6823 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6824 SIBLING_CALL_P (insn) = 1;
6825
6826 insn = get_insns ();
6827 shorten_branches (insn);
6828
6829 assemble_start_function (thunk, fnname);
6830 final_start_function (insn, file, 1);
6831 final (insn, file, 1);
6832 final_end_function ();
6833 assemble_end_function (thunk, fnname);
6834
6835 /* Stop pretending to be a post-reload pass. */
6836 reload_completed = 0;
6837 }
6838
6839 static bool
6840 aarch64_tls_referenced_p (rtx x)
6841 {
6842 if (!TARGET_HAVE_TLS)
6843 return false;
6844 subrtx_iterator::array_type array;
6845 FOR_EACH_SUBRTX (iter, array, x, ALL)
6846 {
6847 const_rtx x = *iter;
6848 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6849 return true;
6850 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6851 TLS offsets, not real symbol references. */
6852 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6853 iter.skip_subrtxes ();
6854 }
6855 return false;
6856 }
6857
6858
6859 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6860 a left shift of 0 or 12 bits. */
6861 bool
6862 aarch64_uimm12_shift (HOST_WIDE_INT val)
6863 {
6864 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6865 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6866 );
6867 }
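/* Worked examples, matching the immediate range of ADD/SUB (immediate),
   which accepts a 12-bit value optionally shifted left by 12:

     aarch64_uimm12_shift (0xabc)     -> true   (fits in bits [11:0])
     aarch64_uimm12_shift (0xabc000)  -> true   (0xabc << 12)
     aarch64_uimm12_shift (0xabc001)  -> false  (bits span both fields)  */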
6868
6869 /* Return the largest value no greater than VAL that can be created as a 12-bit
6870 unsigned immediate with a left shift of 0 or 12. VAL must fit in 24 bits. */
6871 static HOST_WIDE_INT
6872 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6873 {
6874 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6875 handle correctly. */
6876 gcc_assert ((val & 0xffffff) == val);
6877
6878 if (((val & 0xfff) << 0) == val)
6879 return val;
6880
6881 return val & (0xfff << 12);
6882 }
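/* Worked example: for VAL = 0x123456 the low 12 bits are nonzero and the
   value does not fit unshifted, so the function returns
   0x123456 & (0xfff << 12) = 0x123000, which is representable with
   LSL #12.  */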
6883
6884 /* Return true if val is an immediate that can be loaded into a
6885 register by a MOVZ instruction. */
6886 static bool
6887 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6888 {
6889 if (GET_MODE_SIZE (mode) > 4)
6890 {
6891 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6892 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6893 return true;
6894 }
6895 else
6896 {
6897 /* Ignore sign extension. */
6898 val &= (HOST_WIDE_INT) 0xffffffff;
6899 }
6900 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6901 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6902 }
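/* Worked examples for DImode (a 64-bit HOST_WIDE_INT is assumed):

     aarch64_movw_imm (0xffff, DImode)          -> true   (MOVZ, LSL #0)
     aarch64_movw_imm (0xffff0000, DImode)      -> true   (MOVZ, LSL #16)
     aarch64_movw_imm (0xffff00000000, DImode)  -> true   (MOVZ, LSL #32)
     aarch64_movw_imm (0x12345678, DImode)      -> false  (needs MOVZ+MOVK)  */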
6903
6904 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6905 64-bit (DImode) integer. */
6906
6907 static unsigned HOST_WIDE_INT
6908 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6909 {
6910 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6911 while (size < 64)
6912 {
6913 val &= (HOST_WIDE_INT_1U << size) - 1;
6914 val |= val << size;
6915 size *= 2;
6916 }
6917 return val;
6918 }
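/* Worked example: replicating the QImode value 0x3e gives
   0x3e3e3e3e3e3e3e3e, and replicating the HImode value 0x00ff gives
   0x00ff00ff00ff00ff, so the 64-bit bitmask tests below can be applied
   regardless of the original element size.  */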
6919
6920 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6921
6922 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6923 {
6924 0x0000000100000001ull,
6925 0x0001000100010001ull,
6926 0x0101010101010101ull,
6927 0x1111111111111111ull,
6928 0x5555555555555555ull,
6929 };
6930
6931
6932 /* Return true if val is a valid bitmask immediate. */
6933
6934 bool
6935 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6936 {
6937 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6938 int bits;
6939
6940 /* Check for a single sequence of one bits and return quickly if so.
6941 The special cases of all ones and all zeroes return false. */
6942 val = aarch64_replicate_bitmask_imm (val_in, mode);
6943 tmp = val + (val & -val);
6944
6945 if (tmp == (tmp & -tmp))
6946 return (val + 1) > 1;
6947
6948 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6949 if (mode == SImode)
6950 val = (val << 32) | (val & 0xffffffff);
6951
6952 /* Invert if the immediate doesn't start with a zero bit - this means we
6953 only need to search for sequences of one bits. */
6954 if (val & 1)
6955 val = ~val;
6956
6957 /* Find the first set bit and set tmp to val with the first sequence of one
6958 bits removed. Return success if there is a single sequence of ones. */
6959 first_one = val & -val;
6960 tmp = val & (val + first_one);
6961
6962 if (tmp == 0)
6963 return true;
6964
6965 /* Find the next set bit and compute the difference in bit position. */
6966 next_one = tmp & -tmp;
6967 bits = clz_hwi (first_one) - clz_hwi (next_one);
6968 mask = val ^ tmp;
6969
6970 /* Check the bit position difference is a power of 2, and that the first
6971 sequence of one bits fits within 'bits' bits. */
6972 if ((mask >> bits) != 0 || bits != (bits & -bits))
6973 return false;
6974
6975 /* Check the sequence of one bits is repeated 64/bits times. */
6976 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6977 }
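/* Worked examples for DImode (each value viewed as a 64-bit pattern):

     0x000000000000fff0  -> true   (single contiguous run of ones)
     0x5555555555555555  -> true   ("01" repeated every 2 bits)
     0x00ff00ff00ff00ff  -> true   (8 ones repeated every 16 bits)
     0x0000000000001234  -> false  (runs do not form a repeating pattern)
     0x0000000000000000  -> false  (all zeros is rejected)
     0xffffffffffffffff  -> false  (all ones is rejected)  */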
6978
6979 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6980 Assumed precondition: VAL_IN is not zero. */
6981
6982 unsigned HOST_WIDE_INT
6983 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6984 {
6985 int lowest_bit_set = ctz_hwi (val_in);
6986 int highest_bit_set = floor_log2 (val_in);
6987 gcc_assert (val_in != 0);
6988
6989 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6990 (HOST_WIDE_INT_1U << lowest_bit_set));
6991 }
6992
6993 /* Create a constant in which all bits outside the range from the lowest set
6994 bit to the highest set bit of VAL_IN are set to 1. */
6995
6996 unsigned HOST_WIDE_INT
6997 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6998 {
6999 return val_in | ~aarch64_and_split_imm1 (val_in);
7000 }
7001
7002 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7003
7004 bool
7005 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7006 {
7007 scalar_int_mode int_mode;
7008 if (!is_a <scalar_int_mode> (mode, &int_mode))
7009 return false;
7010
7011 if (aarch64_bitmask_imm (val_in, int_mode))
7012 return false;
7013
7014 if (aarch64_move_imm (val_in, int_mode))
7015 return false;
7016
7017 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7018
7019 return aarch64_bitmask_imm (imm2, int_mode);
7020 }
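/* Worked example: VAL_IN = 0xff0f0 is neither a bitmask immediate (it has
   two runs of ones) nor a MOV immediate, but it can be split as

     x & 0xff0f0  ==  (x & 0xffff0) & 0xfffffffffffff0ff

   where 0xffff0 == aarch64_and_split_imm1 (0xff0f0) covers the span from
   the lowest to the highest set bit and 0xfffffffffffff0ff
   == aarch64_and_split_imm2 (0xff0f0) is a bitmask immediate, so the AND
   can be done with two instructions instead of a literal load.  */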
7021
7022 /* Return true if val is an immediate that can be loaded into a
7023 register in a single instruction. */
7024 bool
7025 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7026 {
7027 scalar_int_mode int_mode;
7028 if (!is_a <scalar_int_mode> (mode, &int_mode))
7029 return false;
7030
7031 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7032 return true;
7033 return aarch64_bitmask_imm (val, int_mode);
7034 }
7035
7036 static bool
7037 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7038 {
7039 rtx base, offset;
7040
7041 if (GET_CODE (x) == HIGH)
7042 return true;
7043
7044 /* There's no way to calculate VL-based values using relocations. */
7045 subrtx_iterator::array_type array;
7046 FOR_EACH_SUBRTX (iter, array, x, ALL)
7047 if (GET_CODE (*iter) == CONST_POLY_INT)
7048 return true;
7049
7050 split_const (x, &base, &offset);
7051 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7052 {
7053 if (aarch64_classify_symbol (base, INTVAL (offset))
7054 != SYMBOL_FORCE_TO_MEM)
7055 return true;
7056 else
7057 /* Avoid generating a 64-bit relocation in ILP32; leave it
7058 to aarch64_expand_mov_immediate to handle properly. */
7059 return mode != ptr_mode;
7060 }
7061
7062 return aarch64_tls_referenced_p (x);
7063 }
7064
7065 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7066 The expansion for a table switch is quite expensive due to the number
7067 of instructions, the table lookup and the hard-to-predict indirect jump.
7068 When optimizing for speed at -O3 and above, use the per-core tuning if
7069 set; otherwise use tables for more than 16 cases as a trade-off between
7070 size and performance. When optimizing for size, use the default setting. */
7071
7072 static unsigned int
7073 aarch64_case_values_threshold (void)
7074 {
7075 /* Use the specified limit for the number of cases before using jump
7076 tables at higher optimization levels. */
7077 if (optimize > 2
7078 && selected_cpu->tune->max_case_values != 0)
7079 return selected_cpu->tune->max_case_values;
7080 else
7081 return optimize_size ? default_case_values_threshold () : 17;
7082 }
7083
7084 /* Return true if register REGNO is a valid index register.
7085 STRICT_P is true if REG_OK_STRICT is in effect. */
7086
7087 bool
7088 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7089 {
7090 if (!HARD_REGISTER_NUM_P (regno))
7091 {
7092 if (!strict_p)
7093 return true;
7094
7095 if (!reg_renumber)
7096 return false;
7097
7098 regno = reg_renumber[regno];
7099 }
7100 return GP_REGNUM_P (regno);
7101 }
7102
7103 /* Return true if register REGNO is a valid base register.
7104 STRICT_P is true if REG_OK_STRICT is in effect. */
7105
7106 bool
7107 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7108 {
7109 if (!HARD_REGISTER_NUM_P (regno))
7110 {
7111 if (!strict_p)
7112 return true;
7113
7114 if (!reg_renumber)
7115 return false;
7116
7117 regno = reg_renumber[regno];
7118 }
7119
7120 /* The fake registers will be eliminated to either the stack or
7121 hard frame pointer, both of which are usually valid base registers.
7122 Reload deals with the cases where the eliminated form isn't valid. */
7123 return (GP_REGNUM_P (regno)
7124 || regno == SP_REGNUM
7125 || regno == FRAME_POINTER_REGNUM
7126 || regno == ARG_POINTER_REGNUM);
7127 }
7128
7129 /* Return true if X is a valid base register.
7130 STRICT_P is true if REG_OK_STRICT is in effect. */
7131
7132 static bool
7133 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7134 {
7135 if (!strict_p
7136 && GET_CODE (x) == SUBREG
7137 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7138 x = SUBREG_REG (x);
7139
7140 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7141 }
7142
7143 /* Return true if the address offset X is a valid index. If it is, fill in INFO
7144 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7145
7146 static bool
7147 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7148 machine_mode mode, bool strict_p)
7149 {
7150 enum aarch64_address_type type;
7151 rtx index;
7152 int shift;
7153
7154 /* (reg:P) */
7155 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7156 && GET_MODE (x) == Pmode)
7157 {
7158 type = ADDRESS_REG_REG;
7159 index = x;
7160 shift = 0;
7161 }
7162 /* (sign_extend:DI (reg:SI)) */
7163 else if ((GET_CODE (x) == SIGN_EXTEND
7164 || GET_CODE (x) == ZERO_EXTEND)
7165 && GET_MODE (x) == DImode
7166 && GET_MODE (XEXP (x, 0)) == SImode)
7167 {
7168 type = (GET_CODE (x) == SIGN_EXTEND)
7169 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7170 index = XEXP (x, 0);
7171 shift = 0;
7172 }
7173 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7174 else if (GET_CODE (x) == MULT
7175 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7176 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7177 && GET_MODE (XEXP (x, 0)) == DImode
7178 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7179 && CONST_INT_P (XEXP (x, 1)))
7180 {
7181 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7182 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7183 index = XEXP (XEXP (x, 0), 0);
7184 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7185 }
7186 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7187 else if (GET_CODE (x) == ASHIFT
7188 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7189 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7190 && GET_MODE (XEXP (x, 0)) == DImode
7191 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7192 && CONST_INT_P (XEXP (x, 1)))
7193 {
7194 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7195 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7196 index = XEXP (XEXP (x, 0), 0);
7197 shift = INTVAL (XEXP (x, 1));
7198 }
7199 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7200 else if ((GET_CODE (x) == SIGN_EXTRACT
7201 || GET_CODE (x) == ZERO_EXTRACT)
7202 && GET_MODE (x) == DImode
7203 && GET_CODE (XEXP (x, 0)) == MULT
7204 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7205 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7206 {
7207 type = (GET_CODE (x) == SIGN_EXTRACT)
7208 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7209 index = XEXP (XEXP (x, 0), 0);
7210 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7211 if (INTVAL (XEXP (x, 1)) != 32 + shift
7212 || INTVAL (XEXP (x, 2)) != 0)
7213 shift = -1;
7214 }
7215 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7216 (const_int 0xffffffff<<shift)) */
7217 else if (GET_CODE (x) == AND
7218 && GET_MODE (x) == DImode
7219 && GET_CODE (XEXP (x, 0)) == MULT
7220 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7221 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7222 && CONST_INT_P (XEXP (x, 1)))
7223 {
7224 type = ADDRESS_REG_UXTW;
7225 index = XEXP (XEXP (x, 0), 0);
7226 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7227 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7228 shift = -1;
7229 }
7230 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7231 else if ((GET_CODE (x) == SIGN_EXTRACT
7232 || GET_CODE (x) == ZERO_EXTRACT)
7233 && GET_MODE (x) == DImode
7234 && GET_CODE (XEXP (x, 0)) == ASHIFT
7235 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7236 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7237 {
7238 type = (GET_CODE (x) == SIGN_EXTRACT)
7239 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7240 index = XEXP (XEXP (x, 0), 0);
7241 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7242 if (INTVAL (XEXP (x, 1)) != 32 + shift
7243 || INTVAL (XEXP (x, 2)) != 0)
7244 shift = -1;
7245 }
7246 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7247 (const_int 0xffffffff<<shift)) */
7248 else if (GET_CODE (x) == AND
7249 && GET_MODE (x) == DImode
7250 && GET_CODE (XEXP (x, 0)) == ASHIFT
7251 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7252 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7253 && CONST_INT_P (XEXP (x, 1)))
7254 {
7255 type = ADDRESS_REG_UXTW;
7256 index = XEXP (XEXP (x, 0), 0);
7257 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7258 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7259 shift = -1;
7260 }
7261 /* (mult:P (reg:P) (const_int scale)) */
7262 else if (GET_CODE (x) == MULT
7263 && GET_MODE (x) == Pmode
7264 && GET_MODE (XEXP (x, 0)) == Pmode
7265 && CONST_INT_P (XEXP (x, 1)))
7266 {
7267 type = ADDRESS_REG_REG;
7268 index = XEXP (x, 0);
7269 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7270 }
7271 /* (ashift:P (reg:P) (const_int shift)) */
7272 else if (GET_CODE (x) == ASHIFT
7273 && GET_MODE (x) == Pmode
7274 && GET_MODE (XEXP (x, 0)) == Pmode
7275 && CONST_INT_P (XEXP (x, 1)))
7276 {
7277 type = ADDRESS_REG_REG;
7278 index = XEXP (x, 0);
7279 shift = INTVAL (XEXP (x, 1));
7280 }
7281 else
7282 return false;
7283
7284 if (!strict_p
7285 && GET_CODE (index) == SUBREG
7286 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7287 index = SUBREG_REG (index);
7288
7289 if (aarch64_sve_data_mode_p (mode))
7290 {
7291 if (type != ADDRESS_REG_REG
7292 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7293 return false;
7294 }
7295 else
7296 {
7297 if (shift != 0
7298 && !(IN_RANGE (shift, 1, 3)
7299 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7300 return false;
7301 }
7302
7303 if (REG_P (index)
7304 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7305 {
7306 info->type = type;
7307 info->offset = index;
7308 info->shift = shift;
7309 return true;
7310 }
7311
7312 return false;
7313 }
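/* Illustrative index operands accepted above for a DImode (8-byte) access:

     (reg:DI x1)                            -> ADDRESS_REG_REG,  shift 0
     (ashift:DI (reg:DI x1) (const_int 3))  -> ADDRESS_REG_REG,  shift 3
     (sign_extend:DI (reg:SI w1))           -> ADDRESS_REG_SXTW, shift 0

   corresponding to the [xN, x1], [xN, x1, lsl #3] and [xN, w1, sxtw]
   addressing forms.  (Register names are for illustration only.)  */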
7314
7315 /* Return true if MODE is one of the modes for which we
7316 support LDP/STP operations. */
7317
7318 static bool
7319 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7320 {
7321 return mode == SImode || mode == DImode
7322 || mode == SFmode || mode == DFmode
7323 || (aarch64_vector_mode_supported_p (mode)
7324 && (known_eq (GET_MODE_SIZE (mode), 8)
7325 || (known_eq (GET_MODE_SIZE (mode), 16)
7326 && (aarch64_tune_params.extra_tuning_flags
7327 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7328 }
7329
7330 /* Return true if REGNO is a virtual pointer register, or an eliminable
7331 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7332 include stack_pointer or hard_frame_pointer. */
7333 static bool
7334 virt_or_elim_regno_p (unsigned regno)
7335 {
7336 return ((regno >= FIRST_VIRTUAL_REGISTER
7337 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7338 || regno == FRAME_POINTER_REGNUM
7339 || regno == ARG_POINTER_REGNUM);
7340 }
7341
7342 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7343 If it is, fill in INFO appropriately. STRICT_P is true if
7344 REG_OK_STRICT is in effect. */
7345
7346 bool
7347 aarch64_classify_address (struct aarch64_address_info *info,
7348 rtx x, machine_mode mode, bool strict_p,
7349 aarch64_addr_query_type type)
7350 {
7351 enum rtx_code code = GET_CODE (x);
7352 rtx op0, op1;
7353 poly_int64 offset;
7354
7355 HOST_WIDE_INT const_size;
7356
7357 /* On BE, we use load/store pair for all large int mode load/stores.
7358 TI/TFmode may also use a load/store pair. */
7359 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7360 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7361 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7362 || type == ADDR_QUERY_LDP_STP_N
7363 || mode == TImode
7364 || mode == TFmode
7365 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7366
7367 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
7368 size of the memory being loaded/stored, and the mode used for the
7369 corresponding addressing mode is half of that. */
7370 if (type == ADDR_QUERY_LDP_STP_N
7371 && known_eq (GET_MODE_SIZE (mode), 16))
7372 mode = DFmode;
7373
7374 bool allow_reg_index_p = (!load_store_pair_p
7375 && (known_lt (GET_MODE_SIZE (mode), 16)
7376 || vec_flags == VEC_ADVSIMD
7377 || vec_flags & VEC_SVE_DATA));
7378
7379 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7380 [Rn, #offset, MUL VL]. */
7381 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7382 && (code != REG && code != PLUS))
7383 return false;
7384
7385 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7386 REG addressing. */
7387 if (advsimd_struct_p
7388 && !BYTES_BIG_ENDIAN
7389 && (code != POST_INC && code != REG))
7390 return false;
7391
7392 gcc_checking_assert (GET_MODE (x) == VOIDmode
7393 || SCALAR_INT_MODE_P (GET_MODE (x)));
7394
7395 switch (code)
7396 {
7397 case REG:
7398 case SUBREG:
7399 info->type = ADDRESS_REG_IMM;
7400 info->base = x;
7401 info->offset = const0_rtx;
7402 info->const_offset = 0;
7403 return aarch64_base_register_rtx_p (x, strict_p);
7404
7405 case PLUS:
7406 op0 = XEXP (x, 0);
7407 op1 = XEXP (x, 1);
7408
7409 if (!strict_p
7410 && REG_P (op0)
7411 && virt_or_elim_regno_p (REGNO (op0))
7412 && poly_int_rtx_p (op1, &offset))
7413 {
7414 info->type = ADDRESS_REG_IMM;
7415 info->base = op0;
7416 info->offset = op1;
7417 info->const_offset = offset;
7418
7419 return true;
7420 }
7421
7422 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7423 && aarch64_base_register_rtx_p (op0, strict_p)
7424 && poly_int_rtx_p (op1, &offset))
7425 {
7426 info->type = ADDRESS_REG_IMM;
7427 info->base = op0;
7428 info->offset = op1;
7429 info->const_offset = offset;
7430
7431 /* TImode and TFmode values are allowed in both pairs of X
7432 registers and individual Q registers. The available
7433 address modes are:
7434 X,X: 7-bit signed scaled offset
7435 Q: 9-bit signed offset
7436 We conservatively require an offset representable in either mode.
7437 When performing the check for pairs of X registers i.e. LDP/STP
7438 pass down DImode since that is the natural size of the LDP/STP
7439 instruction memory accesses. */
7440 if (mode == TImode || mode == TFmode)
7441 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7442 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7443 || offset_12bit_unsigned_scaled_p (mode, offset)));
7444
7445 /* A 7-bit offset check because OImode will emit an ldp/stp
7446 instruction (only big endian will get here).
7447 For ldp/stp instructions, the offset is scaled for the size of a
7448 single element of the pair. */
7449 if (mode == OImode)
7450 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7451
7452 /* Three 9/12-bit offset checks because CImode will emit three
7453 ldr/str instructions (only big endian will get here). */
7454 if (mode == CImode)
7455 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7456 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7457 offset + 32)
7458 || offset_12bit_unsigned_scaled_p (V16QImode,
7459 offset + 32)));
7460
7461 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7462 instructions (only big endian will get here). */
7463 if (mode == XImode)
7464 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7465 && aarch64_offset_7bit_signed_scaled_p (TImode,
7466 offset + 32));
7467
7468 /* Make "m" use the LD1 offset range for SVE data modes, so
7469 that pre-RTL optimizers like ivopts will work to that range
7470 instead of the wider LDR/STR range. */
7471 if (vec_flags == VEC_SVE_DATA)
7472 return (type == ADDR_QUERY_M
7473 ? offset_4bit_signed_scaled_p (mode, offset)
7474 : offset_9bit_signed_scaled_p (mode, offset));
7475
7476 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7477 {
7478 poly_int64 end_offset = (offset
7479 + GET_MODE_SIZE (mode)
7480 - BYTES_PER_SVE_VECTOR);
7481 return (type == ADDR_QUERY_M
7482 ? offset_4bit_signed_scaled_p (mode, offset)
7483 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7484 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7485 end_offset)));
7486 }
7487
7488 if (vec_flags == VEC_SVE_PRED)
7489 return offset_9bit_signed_scaled_p (mode, offset);
7490
7491 if (load_store_pair_p)
7492 return ((known_eq (GET_MODE_SIZE (mode), 4)
7493 || known_eq (GET_MODE_SIZE (mode), 8)
7494 || known_eq (GET_MODE_SIZE (mode), 16))
7495 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7496 else
7497 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7498 || offset_12bit_unsigned_scaled_p (mode, offset));
7499 }
7500
7501 if (allow_reg_index_p)
7502 {
7503 /* Look for base + (scaled/extended) index register. */
7504 if (aarch64_base_register_rtx_p (op0, strict_p)
7505 && aarch64_classify_index (info, op1, mode, strict_p))
7506 {
7507 info->base = op0;
7508 return true;
7509 }
7510 if (aarch64_base_register_rtx_p (op1, strict_p)
7511 && aarch64_classify_index (info, op0, mode, strict_p))
7512 {
7513 info->base = op1;
7514 return true;
7515 }
7516 }
7517
7518 return false;
7519
7520 case POST_INC:
7521 case POST_DEC:
7522 case PRE_INC:
7523 case PRE_DEC:
7524 info->type = ADDRESS_REG_WB;
7525 info->base = XEXP (x, 0);
7526 info->offset = NULL_RTX;
7527 return aarch64_base_register_rtx_p (info->base, strict_p);
7528
7529 case POST_MODIFY:
7530 case PRE_MODIFY:
7531 info->type = ADDRESS_REG_WB;
7532 info->base = XEXP (x, 0);
7533 if (GET_CODE (XEXP (x, 1)) == PLUS
7534 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7535 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7536 && aarch64_base_register_rtx_p (info->base, strict_p))
7537 {
7538 info->offset = XEXP (XEXP (x, 1), 1);
7539 info->const_offset = offset;
7540
7541 /* TImode and TFmode values are allowed in both pairs of X
7542 registers and individual Q registers. The available
7543 address modes are:
7544 X,X: 7-bit signed scaled offset
7545 Q: 9-bit signed offset
7546 We conservatively require an offset representable in either mode.
7547 */
7548 if (mode == TImode || mode == TFmode)
7549 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7550 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7551
7552 if (load_store_pair_p)
7553 return ((known_eq (GET_MODE_SIZE (mode), 4)
7554 || known_eq (GET_MODE_SIZE (mode), 8)
7555 || known_eq (GET_MODE_SIZE (mode), 16))
7556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7557 else
7558 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7559 }
7560 return false;
7561
7562 case CONST:
7563 case SYMBOL_REF:
7564 case LABEL_REF:
7565 /* load literal: pc-relative constant pool entry. Only supported
7566 for SI mode or larger. */
7567 info->type = ADDRESS_SYMBOLIC;
7568
7569 if (!load_store_pair_p
7570 && GET_MODE_SIZE (mode).is_constant (&const_size)
7571 && const_size >= 4)
7572 {
7573 rtx sym, addend;
7574
7575 split_const (x, &sym, &addend);
7576 return ((GET_CODE (sym) == LABEL_REF
7577 || (GET_CODE (sym) == SYMBOL_REF
7578 && CONSTANT_POOL_ADDRESS_P (sym)
7579 && aarch64_pcrelative_literal_loads)));
7580 }
7581 return false;
7582
7583 case LO_SUM:
7584 info->type = ADDRESS_LO_SUM;
7585 info->base = XEXP (x, 0);
7586 info->offset = XEXP (x, 1);
7587 if (allow_reg_index_p
7588 && aarch64_base_register_rtx_p (info->base, strict_p))
7589 {
7590 rtx sym, offs;
7591 split_const (info->offset, &sym, &offs);
7592 if (GET_CODE (sym) == SYMBOL_REF
7593 && (aarch64_classify_symbol (sym, INTVAL (offs))
7594 == SYMBOL_SMALL_ABSOLUTE))
7595 {
7596 /* The symbol and offset must be aligned to the access size. */
7597 unsigned int align;
7598
7599 if (CONSTANT_POOL_ADDRESS_P (sym))
7600 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7601 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7602 {
7603 tree exp = SYMBOL_REF_DECL (sym);
7604 align = TYPE_ALIGN (TREE_TYPE (exp));
7605 align = aarch64_constant_alignment (exp, align);
7606 }
7607 else if (SYMBOL_REF_DECL (sym))
7608 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7609 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7610 && SYMBOL_REF_BLOCK (sym) != NULL)
7611 align = SYMBOL_REF_BLOCK (sym)->alignment;
7612 else
7613 align = BITS_PER_UNIT;
7614
7615 poly_int64 ref_size = GET_MODE_SIZE (mode);
7616 if (known_eq (ref_size, 0))
7617 ref_size = GET_MODE_SIZE (DImode);
7618
7619 return (multiple_p (INTVAL (offs), ref_size)
7620 && multiple_p (align / BITS_PER_UNIT, ref_size));
7621 }
7622 }
7623 return false;
7624
7625 default:
7626 return false;
7627 }
7628 }
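/* Illustrative classifications for a DImode access in LP64 (register
   names chosen for illustration):

     (reg:DI x0)                                -> ADDRESS_REG_IMM, #0
     (plus:DI (reg:DI x0) (const_int 16))       -> ADDRESS_REG_IMM, #16
     (plus:DI (reg:DI x0)
              (ashift:DI (reg:DI x1)
                         (const_int 3)))        -> base + scaled index
     (post_inc:DI (reg:DI x0))                  -> ADDRESS_REG_WB

   matching the [x0], [x0, #16], [x0, x1, lsl #3] and post-indexed
   [x0], #8 forms.  */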
7629
7630 /* Return true if the address X is valid for a PRFM instruction.
7631 STRICT_P is true if we should do strict checking with
7632 aarch64_classify_address. */
7633
7634 bool
7635 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7636 {
7637 struct aarch64_address_info addr;
7638
7639 /* PRFM accepts the same addresses as DImode... */
7640 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7641 if (!res)
7642 return false;
7643
7644 /* ... except writeback forms. */
7645 return addr.type != ADDRESS_REG_WB;
7646 }
7647
7648 bool
7649 aarch64_symbolic_address_p (rtx x)
7650 {
7651 rtx offset;
7652
7653 split_const (x, &x, &offset);
7654 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7655 }
7656
7657 /* Classify the base of symbolic expression X. */
7658
7659 enum aarch64_symbol_type
7660 aarch64_classify_symbolic_expression (rtx x)
7661 {
7662 rtx offset;
7663
7664 split_const (x, &x, &offset);
7665 return aarch64_classify_symbol (x, INTVAL (offset));
7666 }
7667
7668
7669 /* Return TRUE if X is a legitimate address for accessing memory in
7670 mode MODE. */
7671 static bool
7672 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7673 {
7674 struct aarch64_address_info addr;
7675
7676 return aarch64_classify_address (&addr, x, mode, strict_p);
7677 }
7678
7679 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7680 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7681 bool
7682 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7683 aarch64_addr_query_type type)
7684 {
7685 struct aarch64_address_info addr;
7686
7687 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7688 }
7689
7690 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7691
7692 static bool
7693 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7694 poly_int64 orig_offset,
7695 machine_mode mode)
7696 {
7697 HOST_WIDE_INT size;
7698 if (GET_MODE_SIZE (mode).is_constant (&size))
7699 {
7700 HOST_WIDE_INT const_offset, second_offset;
7701
7702 /* A general SVE offset is A * VQ + B. Remove the A component from
7703 coefficient 0 in order to get the constant B. */
7704 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7705
7706 /* Split an out-of-range address displacement into a base and
7707 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7708 range otherwise to increase opportunities for sharing the base
7709 address of different sizes. Unaligned accesses use the signed
7710 9-bit range, TImode/TFmode use the intersection of signed
7711 scaled 7-bit and signed 9-bit offset. */
7712 if (mode == TImode || mode == TFmode)
7713 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7714 else if ((const_offset & (size - 1)) != 0)
7715 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7716 else
7717 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7718
7719 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7720 return false;
7721
7722 /* Split the offset into second_offset and the rest. */
7723 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7724 *offset2 = gen_int_mode (second_offset, Pmode);
7725 return true;
7726 }
7727 else
7728 {
7729 /* Get the mode we should use as the basis of the range. For structure
7730 modes this is the mode of one vector. */
7731 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7732 machine_mode step_mode
7733 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7734
7735 /* Get the "mul vl" multiplier we'd like to use. */
7736 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7737 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7738 if (vec_flags & VEC_SVE_DATA)
7739 /* LDR supports a 9-bit range, but the move patterns for
7740 structure modes require all vectors to be in range of the
7741 same base. The simplest way of accommodating that while still
7742 promoting reuse of anchor points between different modes is
7743 to use an 8-bit range unconditionally. */
7744 vnum = ((vnum + 128) & 255) - 128;
7745 else
7746 /* Predicates are only handled singly, so we might as well use
7747 the full range. */
7748 vnum = ((vnum + 256) & 511) - 256;
7749 if (vnum == 0)
7750 return false;
7751
7752 /* Convert the "mul vl" multiplier into a byte offset. */
7753 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7754 if (known_eq (second_offset, orig_offset))
7755 return false;
7756
7757 /* Split the offset into second_offset and the rest. */
7758 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7759 *offset2 = gen_int_mode (second_offset, Pmode);
7760 return true;
7761 }
7762 }
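/* Worked example: for an SImode access at constant offset 0x10010 the
   offset is split as

     *offset1 = 0x10000   (added to the base once, reusable as an anchor)
     *offset2 = 0x10      (folded into the access as [base, #16])

   since 0x10010 & 0x3ffc == 0x10 for an aligned 4-byte access.  */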
7763
7764 /* Return the binary representation of floating point constant VALUE in INTVAL.
7765 If the value cannot be converted, return false without setting INTVAL.
7766 The conversion is done in the mode of VALUE. */
7767 bool
7768 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7769 {
7770
7771 /* We make a general exception for 0. */
7772 if (aarch64_float_const_zero_rtx_p (value))
7773 {
7774 *intval = 0;
7775 return true;
7776 }
7777
7778 scalar_float_mode mode;
7779 if (GET_CODE (value) != CONST_DOUBLE
7780 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7781 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7782 /* Only support up to DF mode. */
7783 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7784 return false;
7785
7786 unsigned HOST_WIDE_INT ival = 0;
7787
7788 long res[2];
7789 real_to_target (res,
7790 CONST_DOUBLE_REAL_VALUE (value),
7791 REAL_MODE_FORMAT (mode));
7792
7793 if (mode == DFmode)
7794 {
7795 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7796 ival = zext_hwi (res[order], 32);
7797 ival |= (zext_hwi (res[1 - order], 32) << 32);
7798 }
7799 else
7800 ival = zext_hwi (res[0], 32);
7801
7802 *intval = ival;
7803 return true;
7804 }
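/* Worked examples of the bit patterns produced (standard IEEE encodings):

     1.0 (SFmode)   -> *intval == 0x3f800000
     1.0 (DFmode)   -> *intval == 0x3ff0000000000000
     0.0 (any mode) -> *intval == 0, via the early exit above.  */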
7805
7806 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7807 single MOV(+MOVK) followed by an FMOV. */
7808 bool
7809 aarch64_float_const_rtx_p (rtx x)
7810 {
7811 machine_mode mode = GET_MODE (x);
7812 if (mode == VOIDmode)
7813 return false;
7814
7815 /* Determine whether it's cheaper to write float constants as
7816 mov/movk pairs rather than as ldr/adrp pairs. */
7817 unsigned HOST_WIDE_INT ival;
7818
7819 if (GET_CODE (x) == CONST_DOUBLE
7820 && SCALAR_FLOAT_MODE_P (mode)
7821 && aarch64_reinterpret_float_as_int (x, &ival))
7822 {
7823 scalar_int_mode imode = (mode == HFmode
7824 ? SImode
7825 : int_mode_for_mode (mode).require ());
7826 int num_instr = aarch64_internal_mov_immediate
7827 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7828 return num_instr < 3;
7829 }
7830
7831 return false;
7832 }
7833
7834 /* Return TRUE if rtx X is the immediate constant 0.0. */
7835 bool
7836 aarch64_float_const_zero_rtx_p (rtx x)
7837 {
7838 if (GET_MODE (x) == VOIDmode)
7839 return false;
7840
7841 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7842 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7843 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7844 }
7845
7846 /* Return TRUE if rtx X is an immediate constant that fits in a single
7847 MOVI operation. */
7848 bool
7849 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7850 {
7851 if (!TARGET_SIMD)
7852 return false;
7853
7854 machine_mode vmode;
7855 scalar_int_mode imode;
7856 unsigned HOST_WIDE_INT ival;
7857
7858 if (GET_CODE (x) == CONST_DOUBLE
7859 && SCALAR_FLOAT_MODE_P (mode))
7860 {
7861 if (!aarch64_reinterpret_float_as_int (x, &ival))
7862 return false;
7863
7864 /* We make a general exception for 0. */
7865 if (aarch64_float_const_zero_rtx_p (x))
7866 return true;
7867
7868 imode = int_mode_for_mode (mode).require ();
7869 }
7870 else if (GET_CODE (x) == CONST_INT
7871 && is_a <scalar_int_mode> (mode, &imode))
7872 ival = INTVAL (x);
7873 else
7874 return false;
7875
7876 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
7877 a 128-bit vector mode. */
7878 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7879
7880 vmode = aarch64_simd_container_mode (imode, width);
7881 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7882
7883 return aarch64_simd_valid_immediate (v_op, NULL);
7884 }
7885
7886
7887 /* Return the fixed registers used for condition codes. */
7888
7889 static bool
7890 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7891 {
7892 *p1 = CC_REGNUM;
7893 *p2 = INVALID_REGNUM;
7894 return true;
7895 }
7896
7897 /* This function is used by the call expanders of the machine description.
7898 RESULT is the register in which the result is returned. It's NULL for
7899 "call" and "sibcall".
7900 MEM is the location of the function call.
7901 SIBCALL indicates whether this function call is a normal call or a sibling
7902 call; a different pattern is generated accordingly. */
7903
7904 void
7905 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7906 {
7907 rtx call, callee, tmp;
7908 rtvec vec;
7909 machine_mode mode;
7910
7911 gcc_assert (MEM_P (mem));
7912 callee = XEXP (mem, 0);
7913 mode = GET_MODE (callee);
7914 gcc_assert (mode == Pmode);
7915
7916 /* Decide if we should generate indirect calls by loading the
7917 address of the callee into a register before performing
7918 the branch-and-link. */
7919 if (SYMBOL_REF_P (callee)
7920 ? (aarch64_is_long_call_p (callee)
7921 || aarch64_is_noplt_call_p (callee))
7922 : !REG_P (callee))
7923 XEXP (mem, 0) = force_reg (mode, callee);
7924
7925 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7926
7927 if (result != NULL_RTX)
7928 call = gen_rtx_SET (result, call);
7929
7930 if (sibcall)
7931 tmp = ret_rtx;
7932 else
7933 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7934
7935 vec = gen_rtvec (2, call, tmp);
7936 call = gen_rtx_PARALLEL (VOIDmode, vec);
7937
7938 aarch64_emit_call_insn (call);
7939 }
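/* For illustration: a plain call to a function "foo" (a name chosen for
   illustration) expands to a pattern equivalent to

     (parallel [(call (mem:DI (symbol_ref:DI ("foo"))) (const_int 0))
                (clobber (reg:DI 30 x30))])

   whereas a sibcall uses (return) in place of the clobber of the link
   register (x30 is LR_REGNUM).  */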
7940
7941 /* Emit call insn with PAT and do aarch64-specific handling. */
7942
7943 void
7944 aarch64_emit_call_insn (rtx pat)
7945 {
7946 rtx insn = emit_call_insn (pat);
7947
7948 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7949 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7950 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7951 }
7952
7953 machine_mode
7954 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7955 {
7956 machine_mode mode_x = GET_MODE (x);
7957 rtx_code code_x = GET_CODE (x);
7958
7959 /* All floating point compares return CCFP if it is an equality
7960 comparison, and CCFPE otherwise. */
7961 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7962 {
7963 switch (code)
7964 {
7965 case EQ:
7966 case NE:
7967 case UNORDERED:
7968 case ORDERED:
7969 case UNLT:
7970 case UNLE:
7971 case UNGT:
7972 case UNGE:
7973 case UNEQ:
7974 return CCFPmode;
7975
7976 case LT:
7977 case LE:
7978 case GT:
7979 case GE:
7980 case LTGT:
7981 return CCFPEmode;
7982
7983 default:
7984 gcc_unreachable ();
7985 }
7986 }
7987
7988 /* Equality comparisons of short modes against zero can be performed
7989 using the TST instruction with the appropriate bitmask. */
7990 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7991 && (code == EQ || code == NE)
7992 && (mode_x == HImode || mode_x == QImode))
7993 return CC_NZmode;
7994
7995 /* Similarly, comparisons of zero_extends from shorter modes can
7996 be performed using an ANDS with an immediate mask. */
7997 if (y == const0_rtx && code_x == ZERO_EXTEND
7998 && (mode_x == SImode || mode_x == DImode)
7999 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8000 && (code == EQ || code == NE))
8001 return CC_NZmode;
8002
8003 if ((mode_x == SImode || mode_x == DImode)
8004 && y == const0_rtx
8005 && (code == EQ || code == NE || code == LT || code == GE)
8006 && (code_x == PLUS || code_x == MINUS || code_x == AND
8007 || code_x == NEG
8008 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8009 && CONST_INT_P (XEXP (x, 2)))))
8010 return CC_NZmode;
8011
8012 /* A compare with a shifted operand. Because of canonicalization,
8013 the comparison will have to be swapped when we emit the assembly
8014 code. */
8015 if ((mode_x == SImode || mode_x == DImode)
8016 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8017 && (code_x == ASHIFT || code_x == ASHIFTRT
8018 || code_x == LSHIFTRT
8019 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8020 return CC_SWPmode;
8021
8022 /* Similarly for a negated operand, but we can only do this for
8023 equalities. */
8024 if ((mode_x == SImode || mode_x == DImode)
8025 && (REG_P (y) || GET_CODE (y) == SUBREG)
8026 && (code == EQ || code == NE)
8027 && code_x == NEG)
8028 return CC_Zmode;
8029
8030 /* A test for unsigned overflow from an addition. */
8031 if ((mode_x == DImode || mode_x == TImode)
8032 && (code == LTU || code == GEU)
8033 && code_x == PLUS
8034 && rtx_equal_p (XEXP (x, 0), y))
8035 return CC_Cmode;
8036
8037 /* A test for unsigned overflow from an add with carry. */
8038 if ((mode_x == DImode || mode_x == TImode)
8039 && (code == LTU || code == GEU)
8040 && code_x == PLUS
8041 && CONST_SCALAR_INT_P (y)
8042 && (rtx_mode_t (y, mode_x)
8043 == (wi::shwi (1, mode_x)
8044 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8045 return CC_ADCmode;
8046
8047 /* A test for signed overflow. */
8048 if ((mode_x == DImode || mode_x == TImode)
8049 && code == NE
8050 && code_x == PLUS
8051 && GET_CODE (y) == SIGN_EXTEND)
8052 return CC_Vmode;
8053
8054 /* For everything else, return CCmode. */
8055 return CCmode;
8056 }
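/* Illustrative mode selections (register names for illustration only):

     (ne (reg:HI x0) (const_int 0))                          -> CC_NZmode
     (lt (plus:DI (reg:DI x0) (reg:DI x1)) (const_int 0))    -> CC_NZmode
     (gt (ashift:DI (reg:DI x0) (const_int 2)) (reg:DI x1))  -> CC_SWPmode
     (lt (reg:SF v0) (reg:SF v1))                            -> CCFPEmode  */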
8057
8058 static int
8059 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8060
8061 int
8062 aarch64_get_condition_code (rtx x)
8063 {
8064 machine_mode mode = GET_MODE (XEXP (x, 0));
8065 enum rtx_code comp_code = GET_CODE (x);
8066
8067 if (GET_MODE_CLASS (mode) != MODE_CC)
8068 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8069 return aarch64_get_condition_code_1 (mode, comp_code);
8070 }
8071
8072 static int
8073 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8074 {
8075 switch (mode)
8076 {
8077 case E_CCFPmode:
8078 case E_CCFPEmode:
8079 switch (comp_code)
8080 {
8081 case GE: return AARCH64_GE;
8082 case GT: return AARCH64_GT;
8083 case LE: return AARCH64_LS;
8084 case LT: return AARCH64_MI;
8085 case NE: return AARCH64_NE;
8086 case EQ: return AARCH64_EQ;
8087 case ORDERED: return AARCH64_VC;
8088 case UNORDERED: return AARCH64_VS;
8089 case UNLT: return AARCH64_LT;
8090 case UNLE: return AARCH64_LE;
8091 case UNGT: return AARCH64_HI;
8092 case UNGE: return AARCH64_PL;
8093 default: return -1;
8094 }
8095 break;
8096
8097 case E_CCmode:
8098 switch (comp_code)
8099 {
8100 case NE: return AARCH64_NE;
8101 case EQ: return AARCH64_EQ;
8102 case GE: return AARCH64_GE;
8103 case GT: return AARCH64_GT;
8104 case LE: return AARCH64_LE;
8105 case LT: return AARCH64_LT;
8106 case GEU: return AARCH64_CS;
8107 case GTU: return AARCH64_HI;
8108 case LEU: return AARCH64_LS;
8109 case LTU: return AARCH64_CC;
8110 default: return -1;
8111 }
8112 break;
8113
8114 case E_CC_SWPmode:
8115 switch (comp_code)
8116 {
8117 case NE: return AARCH64_NE;
8118 case EQ: return AARCH64_EQ;
8119 case GE: return AARCH64_LE;
8120 case GT: return AARCH64_LT;
8121 case LE: return AARCH64_GE;
8122 case LT: return AARCH64_GT;
8123 case GEU: return AARCH64_LS;
8124 case GTU: return AARCH64_CC;
8125 case LEU: return AARCH64_CS;
8126 case LTU: return AARCH64_HI;
8127 default: return -1;
8128 }
8129 break;
8130
8131 case E_CC_NZCmode:
8132 switch (comp_code)
8133 {
8134 case NE: return AARCH64_NE; /* = any */
8135 case EQ: return AARCH64_EQ; /* = none */
8136 case GE: return AARCH64_PL; /* = nfrst */
8137 case LT: return AARCH64_MI; /* = first */
8138 case GEU: return AARCH64_CS; /* = nlast */
8139 case GTU: return AARCH64_HI; /* = pmore */
8140 case LEU: return AARCH64_LS; /* = plast */
8141 case LTU: return AARCH64_CC; /* = last */
8142 default: return -1;
8143 }
8144 break;
8145
8146 case E_CC_NZmode:
8147 switch (comp_code)
8148 {
8149 case NE: return AARCH64_NE;
8150 case EQ: return AARCH64_EQ;
8151 case GE: return AARCH64_PL;
8152 case LT: return AARCH64_MI;
8153 default: return -1;
8154 }
8155 break;
8156
8157 case E_CC_Zmode:
8158 switch (comp_code)
8159 {
8160 case NE: return AARCH64_NE;
8161 case EQ: return AARCH64_EQ;
8162 default: return -1;
8163 }
8164 break;
8165
8166 case E_CC_Cmode:
8167 switch (comp_code)
8168 {
8169 case LTU: return AARCH64_CS;
8170 case GEU: return AARCH64_CC;
8171 default: return -1;
8172 }
8173 break;
8174
8175 case E_CC_ADCmode:
8176 switch (comp_code)
8177 {
8178 case GEU: return AARCH64_CS;
8179 case LTU: return AARCH64_CC;
8180 default: return -1;
8181 }
8182 break;
8183
8184 case E_CC_Vmode:
8185 switch (comp_code)
8186 {
8187 case NE: return AARCH64_VS;
8188 case EQ: return AARCH64_VC;
8189 default: return -1;
8190 }
8191 break;
8192
8193 default:
8194 return -1;
8195 }
8196
8197 return -1;
8198 }
8199
8200 bool
8201 aarch64_const_vec_all_same_in_range_p (rtx x,
8202 HOST_WIDE_INT minval,
8203 HOST_WIDE_INT maxval)
8204 {
8205 rtx elt;
8206 return (const_vec_duplicate_p (x, &elt)
8207 && CONST_INT_P (elt)
8208 && IN_RANGE (INTVAL (elt), minval, maxval));
8209 }
8210
8211 bool
8212 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8213 {
8214 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8215 }
8216
8217 /* Return true if VEC is a constant in which every element is in the range
8218 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8219
8220 static bool
8221 aarch64_const_vec_all_in_range_p (rtx vec,
8222 HOST_WIDE_INT minval,
8223 HOST_WIDE_INT maxval)
8224 {
8225 if (GET_CODE (vec) != CONST_VECTOR
8226 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8227 return false;
8228
8229 int nunits;
8230 if (!CONST_VECTOR_STEPPED_P (vec))
8231 nunits = const_vector_encoded_nelts (vec);
8232 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8233 return false;
8234
8235 for (int i = 0; i < nunits; i++)
8236 {
8237 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8238 if (!CONST_INT_P (vec_elem)
8239 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8240 return false;
8241 }
8242 return true;
8243 }
8244
8245 /* N Z C V. */
8246 #define AARCH64_CC_V 1
8247 #define AARCH64_CC_C (1 << 1)
8248 #define AARCH64_CC_Z (1 << 2)
8249 #define AARCH64_CC_N (1 << 3)
8250
8251 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8252 static const int aarch64_nzcv_codes[] =
8253 {
8254 0, /* EQ, Z == 1. */
8255 AARCH64_CC_Z, /* NE, Z == 0. */
8256 0, /* CS, C == 1. */
8257 AARCH64_CC_C, /* CC, C == 0. */
8258 0, /* MI, N == 1. */
8259 AARCH64_CC_N, /* PL, N == 0. */
8260 0, /* VS, V == 1. */
8261 AARCH64_CC_V, /* VC, V == 0. */
8262 0, /* HI, C == 1 && Z == 0. */
8263 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8264 AARCH64_CC_V, /* GE, N == V. */
8265 0, /* LT, N != V. */
8266 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8267 0, /* LE, !(Z == 0 && N == V). */
8268 0, /* AL, Any. */
8269 0 /* NV, Any. */
8270 };
8271
8272 /* Print floating-point vector immediate operand X to F, negating it
8273 first if NEGATE is true. Return true on success, false if it isn't
8274 a constant we can handle. */
8275
8276 static bool
8277 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8278 {
8279 rtx elt;
8280
8281 if (!const_vec_duplicate_p (x, &elt))
8282 return false;
8283
8284 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8285 if (negate)
8286 r = real_value_negate (&r);
8287
8288 /* Handle the SVE single-bit immediates specially, since they have a
8289 fixed form in the assembly syntax. */
8290 if (real_equal (&r, &dconst0))
8291 asm_fprintf (f, "0.0");
8292 else if (real_equal (&r, &dconst1))
8293 asm_fprintf (f, "1.0");
8294 else if (real_equal (&r, &dconsthalf))
8295 asm_fprintf (f, "0.5");
8296 else
8297 {
8298 const int buf_size = 20;
8299 char float_buf[buf_size] = {'\0'};
8300 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8301 1, GET_MODE (elt));
8302 asm_fprintf (f, "%s", float_buf);
8303 }
8304
8305 return true;
8306 }
8307
8308 /* Return the equivalent letter for size. */
8309 static char
8310 sizetochar (int size)
8311 {
8312 switch (size)
8313 {
8314 case 64: return 'd';
8315 case 32: return 's';
8316 case 16: return 'h';
8317 case 8 : return 'b';
8318 default: gcc_unreachable ();
8319 }
8320 }
8321
8322 /* Print operand X to file F in a target specific manner according to CODE.
8323 The acceptable formatting commands given by CODE are:
8324 'c': An integer or symbol address without a preceding #
8325 sign.
8326 'C': Take the duplicated element in a vector constant
8327 and print it in hex.
8328 'D': Take the duplicated element in a vector constant
8329 and print it as an unsigned integer, in decimal.
8330 'e': Print the sign/zero-extend size as a character 8->b,
8331 16->h, 32->w.
8332 'I': If the operand is a duplicated vector constant,
8333 replace it with the duplicated scalar. If the
8334 operand is then a floating-point constant, replace
8335 it with the integer bit representation. Print the
8336 transformed constant as a signed decimal number.
8337 'p': Prints N such that 2^N == X (X must be a power of 2 and
8338 a const_int).
8339 'P': Print the number of non-zero bits in X (a const_int).
8340 'H': Print the higher numbered register of a pair (TImode)
8341 of regs.
8342 'm': Print a condition (eq, ne, etc).
8343 'M': Same as 'm', but invert condition.
8344 'N': Take the duplicated element in a vector constant
8345 and print the negative of it in decimal.
8346 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8347 'S/T/U/V': Print a FP/SIMD register name for a register list.
8348 The register printed is the FP/SIMD register name
8349 of X + 0/1/2/3 for S/T/U/V.
8350 'R': Print a scalar FP/SIMD register name + 1.
8351 'X': Print bottom 16 bits of integer constant in hex.
8352 'w/x': Print a general register name or the zero register
8353 (32-bit or 64-bit).
8354 '0': Print a normal operand; if it's a general register,
8355 then we assume DImode.
8356 'k': Print NZCV for conditional compare instructions.
8357 'A': Output address constant representing the first
8358 argument of X, specifying a relocation offset
8359 if appropriate.
8360 'L': Output constant address specified by X
8361 with a relocation offset if appropriate.
8362 'G': Prints address of X, specifying a PC relative
8363 relocation mode if appropriate.
8364 'y': Output address of LDP or STP - this is used for
8365 some LDP/STPs which don't use a PARALLEL in their
8366 pattern (so the mode needs to be adjusted).
8367 'z': Output address of a typical LDP or STP. */
8368
8369 static void
8370 aarch64_print_operand (FILE *f, rtx x, int code)
8371 {
8372 rtx elt;
8373 switch (code)
8374 {
8375 case 'c':
8376 switch (GET_CODE (x))
8377 {
8378 case CONST_INT:
8379 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8380 break;
8381
8382 case SYMBOL_REF:
8383 output_addr_const (f, x);
8384 break;
8385
8386 case CONST:
8387 if (GET_CODE (XEXP (x, 0)) == PLUS
8388 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8389 {
8390 output_addr_const (f, x);
8391 break;
8392 }
8393 /* Fall through. */
8394
8395 default:
8396 output_operand_lossage ("unsupported operand for code '%c'", code);
8397 }
8398 break;
8399
8400 case 'e':
8401 {
8402 int n;
8403
8404 if (!CONST_INT_P (x)
8405 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8406 {
8407 output_operand_lossage ("invalid operand for '%%%c'", code);
8408 return;
8409 }
8410
8411 switch (n)
8412 {
8413 case 3:
8414 fputc ('b', f);
8415 break;
8416 case 4:
8417 fputc ('h', f);
8418 break;
8419 case 5:
8420 fputc ('w', f);
8421 break;
8422 default:
8423 output_operand_lossage ("invalid operand for '%%%c'", code);
8424 return;
8425 }
8426 }
8427 break;
8428
8429 case 'p':
8430 {
8431 int n;
8432
8433 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8434 {
8435 output_operand_lossage ("invalid operand for '%%%c'", code);
8436 return;
8437 }
8438
8439 asm_fprintf (f, "%d", n);
8440 }
8441 break;
8442
8443 case 'P':
8444 if (!CONST_INT_P (x))
8445 {
8446 output_operand_lossage ("invalid operand for '%%%c'", code);
8447 return;
8448 }
8449
8450 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8451 break;
8452
8453 case 'H':
8454 if (x == const0_rtx)
8455 {
8456 asm_fprintf (f, "xzr");
8457 break;
8458 }
8459
8460 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8461 {
8462 output_operand_lossage ("invalid operand for '%%%c'", code);
8463 return;
8464 }
8465
8466 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8467 break;
8468
8469 case 'I':
8470 {
8471 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8472 if (CONST_INT_P (x))
8473 asm_fprintf (f, "%wd", INTVAL (x));
8474 else
8475 {
8476 output_operand_lossage ("invalid operand for '%%%c'", code);
8477 return;
8478 }
8479 break;
8480 }
8481
8482 case 'M':
8483 case 'm':
8484 {
8485 int cond_code;
8486 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8487 if (x == const_true_rtx)
8488 {
8489 if (code == 'M')
8490 fputs ("nv", f);
8491 return;
8492 }
8493
8494 if (!COMPARISON_P (x))
8495 {
8496 output_operand_lossage ("invalid operand for '%%%c'", code);
8497 return;
8498 }
8499
8500 cond_code = aarch64_get_condition_code (x);
8501 gcc_assert (cond_code >= 0);
8502 if (code == 'M')
8503 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8504 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8505 fputs (aarch64_sve_condition_codes[cond_code], f);
8506 else
8507 fputs (aarch64_condition_codes[cond_code], f);
8508 }
8509 break;
8510
8511 case 'N':
8512 if (!const_vec_duplicate_p (x, &elt))
8513 {
8514 output_operand_lossage ("invalid vector constant");
8515 return;
8516 }
8517
8518 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8519 asm_fprintf (f, "%wd", -INTVAL (elt));
8520 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8521 && aarch64_print_vector_float_operand (f, x, true))
8522 ;
8523 else
8524 {
8525 output_operand_lossage ("invalid vector constant");
8526 return;
8527 }
8528 break;
8529
8530 case 'b':
8531 case 'h':
8532 case 's':
8533 case 'd':
8534 case 'q':
8535 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8536 {
8537 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8538 return;
8539 }
8540 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8541 break;
8542
8543 case 'S':
8544 case 'T':
8545 case 'U':
8546 case 'V':
8547 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8548 {
8549 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8550 return;
8551 }
8552 asm_fprintf (f, "%c%d",
8553 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8554 REGNO (x) - V0_REGNUM + (code - 'S'));
8555 break;
8556
8557 case 'R':
8558 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8559 {
8560 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8561 return;
8562 }
8563 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8564 break;
8565
8566 case 'X':
8567 if (!CONST_INT_P (x))
8568 {
8569 output_operand_lossage ("invalid operand for '%%%c'", code);
8570 return;
8571 }
8572 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8573 break;
8574
8575 case 'C':
8576 {
8577 /* Print a replicated constant in hex. */
8578 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8579 {
8580 output_operand_lossage ("invalid operand for '%%%c'", code);
8581 return;
8582 }
8583 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8584 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8585 }
8586 break;
8587
8588 case 'D':
8589 {
8590 /* Print a replicated constant in decimal, treating it as
8591 unsigned. */
8592 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8593 {
8594 output_operand_lossage ("invalid operand for '%%%c'", code);
8595 return;
8596 }
8597 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8598 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8599 }
8600 break;
8601
8602 case 'w':
8603 case 'x':
8604 if (x == const0_rtx
8605 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8606 {
8607 asm_fprintf (f, "%czr", code);
8608 break;
8609 }
8610
8611 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8612 {
8613 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8614 break;
8615 }
8616
8617 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8618 {
8619 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8620 break;
8621 }
8622
8623 /* Fall through */
8624
8625 case 0:
8626 if (x == NULL)
8627 {
8628 output_operand_lossage ("missing operand");
8629 return;
8630 }
8631
8632 switch (GET_CODE (x))
8633 {
8634 case REG:
8635 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8636 {
8637 if (REG_NREGS (x) == 1)
8638 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8639 else
8640 {
8641 char suffix
8642 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8643 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8644 REGNO (x) - V0_REGNUM, suffix,
8645 END_REGNO (x) - V0_REGNUM - 1, suffix);
8646 }
8647 }
8648 else
8649 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8650 break;
8651
8652 case MEM:
8653 output_address (GET_MODE (x), XEXP (x, 0));
8654 break;
8655
8656 case LABEL_REF:
8657 case SYMBOL_REF:
8658 output_addr_const (asm_out_file, x);
8659 break;
8660
8661 case CONST_INT:
8662 asm_fprintf (f, "%wd", INTVAL (x));
8663 break;
8664
8665 case CONST:
8666 if (!VECTOR_MODE_P (GET_MODE (x)))
8667 {
8668 output_addr_const (asm_out_file, x);
8669 break;
8670 }
8671 /* fall through */
8672
8673 case CONST_VECTOR:
8674 if (!const_vec_duplicate_p (x, &elt))
8675 {
8676 output_operand_lossage ("invalid vector constant");
8677 return;
8678 }
8679
8680 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8681 asm_fprintf (f, "%wd", INTVAL (elt));
8682 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8683 && aarch64_print_vector_float_operand (f, x, false))
8684 ;
8685 else
8686 {
8687 output_operand_lossage ("invalid vector constant");
8688 return;
8689 }
8690 break;
8691
8692 case CONST_DOUBLE:
8693 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8694 be getting CONST_DOUBLEs holding integers. */
8695 gcc_assert (GET_MODE (x) != VOIDmode);
8696 if (aarch64_float_const_zero_rtx_p (x))
8697 {
8698 fputc ('0', f);
8699 break;
8700 }
8701 else if (aarch64_float_const_representable_p (x))
8702 {
8703 #define buf_size 20
8704 char float_buf[buf_size] = {'\0'};
8705 real_to_decimal_for_mode (float_buf,
8706 CONST_DOUBLE_REAL_VALUE (x),
8707 buf_size, buf_size,
8708 1, GET_MODE (x));
8709 asm_fprintf (asm_out_file, "%s", float_buf);
8710 break;
8711 #undef buf_size
8712 }
8713 output_operand_lossage ("invalid constant");
8714 return;
8715 default:
8716 output_operand_lossage ("invalid operand");
8717 return;
8718 }
8719 break;
8720
8721 case 'A':
8722 if (GET_CODE (x) == HIGH)
8723 x = XEXP (x, 0);
8724
8725 switch (aarch64_classify_symbolic_expression (x))
8726 {
8727 case SYMBOL_SMALL_GOT_4G:
8728 asm_fprintf (asm_out_file, ":got:");
8729 break;
8730
8731 case SYMBOL_SMALL_TLSGD:
8732 asm_fprintf (asm_out_file, ":tlsgd:");
8733 break;
8734
8735 case SYMBOL_SMALL_TLSDESC:
8736 asm_fprintf (asm_out_file, ":tlsdesc:");
8737 break;
8738
8739 case SYMBOL_SMALL_TLSIE:
8740 asm_fprintf (asm_out_file, ":gottprel:");
8741 break;
8742
8743 case SYMBOL_TLSLE24:
8744 asm_fprintf (asm_out_file, ":tprel:");
8745 break;
8746
8747 case SYMBOL_TINY_GOT:
8748 gcc_unreachable ();
8749 break;
8750
8751 default:
8752 break;
8753 }
8754 output_addr_const (asm_out_file, x);
8755 break;
8756
8757 case 'L':
8758 switch (aarch64_classify_symbolic_expression (x))
8759 {
8760 case SYMBOL_SMALL_GOT_4G:
8761 asm_fprintf (asm_out_file, ":lo12:");
8762 break;
8763
8764 case SYMBOL_SMALL_TLSGD:
8765 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8766 break;
8767
8768 case SYMBOL_SMALL_TLSDESC:
8769 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8770 break;
8771
8772 case SYMBOL_SMALL_TLSIE:
8773 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8774 break;
8775
8776 case SYMBOL_TLSLE12:
8777 asm_fprintf (asm_out_file, ":tprel_lo12:");
8778 break;
8779
8780 case SYMBOL_TLSLE24:
8781 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8782 break;
8783
8784 case SYMBOL_TINY_GOT:
8785 asm_fprintf (asm_out_file, ":got:");
8786 break;
8787
8788 case SYMBOL_TINY_TLSIE:
8789 asm_fprintf (asm_out_file, ":gottprel:");
8790 break;
8791
8792 default:
8793 break;
8794 }
8795 output_addr_const (asm_out_file, x);
8796 break;
8797
8798 case 'G':
8799 switch (aarch64_classify_symbolic_expression (x))
8800 {
8801 case SYMBOL_TLSLE24:
8802 asm_fprintf (asm_out_file, ":tprel_hi12:");
8803 break;
8804 default:
8805 break;
8806 }
8807 output_addr_const (asm_out_file, x);
8808 break;
8809
8810 case 'k':
8811 {
8812 HOST_WIDE_INT cond_code;
8813
8814 if (!CONST_INT_P (x))
8815 {
8816 output_operand_lossage ("invalid operand for '%%%c'", code);
8817 return;
8818 }
8819
8820 cond_code = INTVAL (x);
8821 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8822 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8823 }
8824 break;
8825
8826 case 'y':
8827 case 'z':
8828 {
8829 machine_mode mode = GET_MODE (x);
8830
8831 if (GET_CODE (x) != MEM
8832 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8833 {
8834 output_operand_lossage ("invalid operand for '%%%c'", code);
8835 return;
8836 }
8837
8838 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8839 code == 'y'
8840 ? ADDR_QUERY_LDP_STP_N
8841 : ADDR_QUERY_LDP_STP))
8842 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8843 }
8844 break;
8845
8846 default:
8847 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8848 return;
8849 }
8850 }
8851
8852 /* Print address 'x' of a memory access with mode 'mode'.  'type' is the
8853 aarch64_addr_query_type context passed to aarch64_classify_address, e.g.
8854 ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
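/* As a rough illustration, the forms printed below include plain and
   immediate-offset bases such as [x0] and [x0, 16], SVE offsets such as
   [x0, #2, mul vl], register offsets such as [x0, x1, lsl 3] and
   [x0, w1, sxtw 2], writeback forms such as [x0, 16]! and [x0], 16, and
   the LO_SUM form [x0, #:lo12:sym].  */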
8855 static bool
8856 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8857 aarch64_addr_query_type type)
8858 {
8859 struct aarch64_address_info addr;
8860 unsigned int size;
8861
8862 /* Check all addresses are Pmode - including ILP32. */
8863 if (GET_MODE (x) != Pmode
8864 && (!CONST_INT_P (x)
8865 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8866 {
8867 output_operand_lossage ("invalid address mode");
8868 return false;
8869 }
8870
8871 if (aarch64_classify_address (&addr, x, mode, true, type))
8872 switch (addr.type)
8873 {
8874 case ADDRESS_REG_IMM:
8875 if (known_eq (addr.const_offset, 0))
8876 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8877 else if (aarch64_sve_data_mode_p (mode))
8878 {
8879 HOST_WIDE_INT vnum
8880 = exact_div (addr.const_offset,
8881 BYTES_PER_SVE_VECTOR).to_constant ();
8882 asm_fprintf (f, "[%s, #%wd, mul vl]",
8883 reg_names[REGNO (addr.base)], vnum);
8884 }
8885 else if (aarch64_sve_pred_mode_p (mode))
8886 {
8887 HOST_WIDE_INT vnum
8888 = exact_div (addr.const_offset,
8889 BYTES_PER_SVE_PRED).to_constant ();
8890 asm_fprintf (f, "[%s, #%wd, mul vl]",
8891 reg_names[REGNO (addr.base)], vnum);
8892 }
8893 else
8894 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8895 INTVAL (addr.offset));
8896 return true;
8897
8898 case ADDRESS_REG_REG:
8899 if (addr.shift == 0)
8900 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8901 reg_names [REGNO (addr.offset)]);
8902 else
8903 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8904 reg_names [REGNO (addr.offset)], addr.shift);
8905 return true;
8906
8907 case ADDRESS_REG_UXTW:
8908 if (addr.shift == 0)
8909 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8910 REGNO (addr.offset) - R0_REGNUM);
8911 else
8912 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8913 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8914 return true;
8915
8916 case ADDRESS_REG_SXTW:
8917 if (addr.shift == 0)
8918 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8919 REGNO (addr.offset) - R0_REGNUM);
8920 else
8921 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8922 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8923 return true;
8924
8925 case ADDRESS_REG_WB:
8926 /* Writeback is only supported for fixed-width modes. */
8927 size = GET_MODE_SIZE (mode).to_constant ();
8928 switch (GET_CODE (x))
8929 {
8930 case PRE_INC:
8931 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8932 return true;
8933 case POST_INC:
8934 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8935 return true;
8936 case PRE_DEC:
8937 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8938 return true;
8939 case POST_DEC:
8940 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8941 return true;
8942 case PRE_MODIFY:
8943 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8944 INTVAL (addr.offset));
8945 return true;
8946 case POST_MODIFY:
8947 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8948 INTVAL (addr.offset));
8949 return true;
8950 default:
8951 break;
8952 }
8953 break;
8954
8955 case ADDRESS_LO_SUM:
8956 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8957 output_addr_const (f, addr.offset);
8958 asm_fprintf (f, "]");
8959 return true;
8960
8961 case ADDRESS_SYMBOLIC:
8962 output_addr_const (f, x);
8963 return true;
8964 }
8965
8966 return false;
8967 }
8968
8969 /* Print address 'x' of a memory access with mode 'mode'. */
8970 static void
8971 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8972 {
8973 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8974 output_addr_const (f, x);
8975 }
8976
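/* Return true if X mentions a label: that is, if it contains a LABEL_REF
   other than the LABEL_REF carried inside an UNSPEC_TLS (see the comment
   in the body below).  */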
8977 bool
8978 aarch64_label_mentioned_p (rtx x)
8979 {
8980 const char *fmt;
8981 int i;
8982
8983 if (GET_CODE (x) == LABEL_REF)
8984 return true;
8985
8986 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8987 referencing instruction, but they are constant offsets, not
8988 symbols. */
8989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8990 return false;
8991
8992 fmt = GET_RTX_FORMAT (GET_CODE (x));
8993 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8994 {
8995 if (fmt[i] == 'E')
8996 {
8997 int j;
8998
8999 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9000 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9001 return 1;
9002 }
9003 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9004 return 1;
9005 }
9006
9007 return 0;
9008 }
9009
9010 /* Implement REGNO_REG_CLASS. */
9011
9012 enum reg_class
9013 aarch64_regno_regclass (unsigned regno)
9014 {
9015 if (GP_REGNUM_P (regno))
9016 return GENERAL_REGS;
9017
9018 if (regno == SP_REGNUM)
9019 return STACK_REG;
9020
9021 if (regno == FRAME_POINTER_REGNUM
9022 || regno == ARG_POINTER_REGNUM)
9023 return POINTER_REGS;
9024
9025 if (FP_REGNUM_P (regno))
9026 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9027 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9028
9029 if (PR_REGNUM_P (regno))
9030 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9031
9032 return NO_REGS;
9033 }
9034
9035 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9036 If OFFSET is out of range, return an offset of an anchor point
9037 that is in range. Return 0 otherwise. */
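/* A rough worked example: a DImode (8-byte) access at offset 0x10008 lies
   outside the scaled 12-bit unsigned range 0..0xfff * 8, so the code below
   returns the anchor 0x10000; the remaining offset of 8 can then be encoded
   directly in the load/store instruction.  */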
9038
9039 static HOST_WIDE_INT
9040 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9041 machine_mode mode)
9042 {
9043 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9044 if (size > 16)
9045 return (offset + 0x400) & ~0x7f0;
9046
9047 /* For offsets that aren't a multiple of the access size, the limit is
9048 -256...255. */
9049 if (offset & (size - 1))
9050 {
9051 /* BLKmode typically uses LDP of X-registers. */
9052 if (mode == BLKmode)
9053 return (offset + 512) & ~0x3ff;
9054 return (offset + 0x100) & ~0x1ff;
9055 }
9056
9057 /* Small negative offsets are supported. */
9058 if (IN_RANGE (offset, -256, 0))
9059 return 0;
9060
9061 if (mode == TImode || mode == TFmode)
9062 return (offset + 0x100) & ~0x1ff;
9063
9064 /* Use a 12-bit unsigned offset scaled by the access size. */
9065 return offset & (~0xfff * size);
9066 }
9067
9068 static rtx
9069 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9070 {
9071 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9072 where mask is selected by alignment and size of the offset.
9073 We try to pick as large a range for the offset as possible to
9074 maximize the chance of a CSE. However, for aligned addresses
9075 we limit the range to 4k so that structures with different sized
9076 elements are likely to use the same base. We need to be careful
9077 not to split a CONST for some forms of address expression, otherwise
9078 it will generate sub-optimal code. */
9079
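/* For example (a sketch of the transformation): an SImode access at
   x1 + 0x4010 is rewritten below as (x1 + 0x4000) + 0x10, so that the
   large anchor part can be shared between neighbouring accesses while
   the small residual fits the immediate offset field.  */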
9080 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9081 {
9082 rtx base = XEXP (x, 0);
9083 rtx offset_rtx = XEXP (x, 1);
9084 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9085
9086 if (GET_CODE (base) == PLUS)
9087 {
9088 rtx op0 = XEXP (base, 0);
9089 rtx op1 = XEXP (base, 1);
9090
9091 /* Force any scaling into a temp for CSE. */
9092 op0 = force_reg (Pmode, op0);
9093 op1 = force_reg (Pmode, op1);
9094
9095 /* Let the pointer register be in op0. */
9096 if (REG_POINTER (op1))
9097 std::swap (op0, op1);
9098
9099 /* If the pointer is virtual or frame related, then we know that
9100 virtual register instantiation or register elimination is going
9101 to apply a second constant. We want the two constants folded
9102 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9103 if (virt_or_elim_regno_p (REGNO (op0)))
9104 {
9105 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9106 NULL_RTX, true, OPTAB_DIRECT);
9107 return gen_rtx_PLUS (Pmode, base, op1);
9108 }
9109
9110 /* Otherwise, in order to encourage CSE (and thence loop strength
9111 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9112 base = expand_binop (Pmode, add_optab, op0, op1,
9113 NULL_RTX, true, OPTAB_DIRECT);
9114 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9115 }
9116
9117 HOST_WIDE_INT size;
9118 if (GET_MODE_SIZE (mode).is_constant (&size))
9119 {
9120 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9121 mode);
9122 if (base_offset != 0)
9123 {
9124 base = plus_constant (Pmode, base, base_offset);
9125 base = force_operand (base, NULL_RTX);
9126 return plus_constant (Pmode, base, offset - base_offset);
9127 }
9128 }
9129 }
9130
9131 return x;
9132 }
9133
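/* Secondary reload handling (the TARGET_SECONDARY_RELOAD hook, judging by
   the name and signature).  Return an intermediate register class needed
   to copy X into a register of class RCLASS in MODE, or NO_REGS if none;
   the cases below that instead need a scratch-based sequence set
   SRI->icode to the reload pattern and return NO_REGS.  */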
9134 static reg_class_t
9135 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9136 reg_class_t rclass,
9137 machine_mode mode,
9138 secondary_reload_info *sri)
9139 {
9140 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9141 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9142 comment at the head of aarch64-sve.md for more details about the
9143 big-endian handling. */
9144 if (BYTES_BIG_ENDIAN
9145 && reg_class_subset_p (rclass, FP_REGS)
9146 && !((REG_P (x) && HARD_REGISTER_P (x))
9147 || aarch64_simd_valid_immediate (x, NULL))
9148 && aarch64_sve_data_mode_p (mode))
9149 {
9150 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9151 return NO_REGS;
9152 }
9153
9154 /* If we have to disable direct literal pool loads and stores because the
9155 function is too big, then we need a scratch register. */
9156 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9157 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9158 || targetm.vector_mode_supported_p (GET_MODE (x)))
9159 && !aarch64_pcrelative_literal_loads)
9160 {
9161 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9162 return NO_REGS;
9163 }
9164
9165 /* Without the TARGET_SIMD instructions we cannot move a Q register
9166 to a Q register directly. We need a scratch. */
9167 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9168 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9169 && reg_class_subset_p (rclass, FP_REGS))
9170 {
9171 sri->icode = code_for_aarch64_reload_mov (mode);
9172 return NO_REGS;
9173 }
9174
9175 /* A TFmode or TImode memory access should be handled via FP_REGS
9176 because AArch64 has richer addressing modes for LDR/STR instructions
9177 than for LDP/STP instructions. */
9178 if (TARGET_FLOAT && rclass == GENERAL_REGS
9179 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9180 return FP_REGS;
9181
9182 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9183 return GENERAL_REGS;
9184
9185 return NO_REGS;
9186 }
9187
9188 static bool
9189 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9190 {
9191 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9192
9193 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9194 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9195 if (frame_pointer_needed)
9196 return to == HARD_FRAME_POINTER_REGNUM;
9197 return true;
9198 }
9199
9200 poly_int64
9201 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9202 {
9203 if (to == HARD_FRAME_POINTER_REGNUM)
9204 {
9205 if (from == ARG_POINTER_REGNUM)
9206 return cfun->machine->frame.hard_fp_offset;
9207
9208 if (from == FRAME_POINTER_REGNUM)
9209 return cfun->machine->frame.hard_fp_offset
9210 - cfun->machine->frame.locals_offset;
9211 }
9212
9213 if (to == STACK_POINTER_REGNUM)
9214 {
9215 if (from == FRAME_POINTER_REGNUM)
9216 return cfun->machine->frame.frame_size
9217 - cfun->machine->frame.locals_offset;
9218 }
9219
9220 return cfun->machine->frame.frame_size;
9221 }
9222
9223 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9224 previous frame. */
9225
9226 rtx
9227 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9228 {
9229 if (count != 0)
9230 return const0_rtx;
9231 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9232 }
9233
9234
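/* Roughly, the trampoline template emitted below looks like:

	[hint 34]			(a "bti c" landing pad, only with BTI)
	ldr	<IP1>, .+offset1	(load the target function address)
	ldr	<chain reg>, .+offset	(load the static chain value)
	br	<IP1>
	[zero padding word]		(only when BTI is not enabled)
	<pointer-sized zero word>	(later filled with the function address)
	<pointer-sized zero word>	(later filled with the static chain)

   aarch64_trampoline_init copies this template and patches the two
   trailing words.  */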
9235 static void
9236 aarch64_asm_trampoline_template (FILE *f)
9237 {
9238 int offset1 = 16;
9239 int offset2 = 20;
9240
9241 if (aarch64_bti_enabled ())
9242 {
9243 asm_fprintf (f, "\thint\t34 // bti c\n");
9244 offset1 -= 4;
9245 offset2 -= 4;
9246 }
9247
9248 if (TARGET_ILP32)
9249 {
9250 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9251 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9252 offset1);
9253 }
9254 else
9255 {
9256 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9257 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9258 offset2);
9259 }
9260 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9261
9262 /* The trampoline needs an extra padding instruction. If BTI is enabled,
9263 the padding instruction is replaced by the BTI instruction emitted at
9264 the beginning. */
9265 if (!aarch64_bti_enabled ())
9266 assemble_aligned_integer (4, const0_rtx);
9267
9268 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9269 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9270 }
9271
9272 static void
9273 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9274 {
9275 rtx fnaddr, mem, a_tramp;
9276 const int tramp_code_sz = 16;
9277
9278 /* Don't need to copy the trailing D-words, we fill those in below. */
9279 emit_block_move (m_tramp, assemble_trampoline_template (),
9280 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9281 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9282 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9283 if (GET_MODE (fnaddr) != ptr_mode)
9284 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9285 emit_move_insn (mem, fnaddr);
9286
9287 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9288 emit_move_insn (mem, chain_value);
9289
9290 /* XXX We should really define a "clear_cache" pattern and use
9291 gen_clear_cache(). */
9292 a_tramp = XEXP (m_tramp, 0);
9293 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9294 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9295 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9296 ptr_mode);
9297 }
9298
9299 static unsigned char
9300 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9301 {
9302 /* ??? Logically we should only need to provide a value when
9303 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9304 can hold MODE, but at the moment we need to handle all modes.
9305 Just ignore any runtime parts for registers that can't store them. */
9306 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9307 unsigned int nregs;
9308 switch (regclass)
9309 {
9310 case TAILCALL_ADDR_REGS:
9311 case POINTER_REGS:
9312 case GENERAL_REGS:
9313 case ALL_REGS:
9314 case POINTER_AND_FP_REGS:
9315 case FP_REGS:
9316 case FP_LO_REGS:
9317 case FP_LO8_REGS:
9318 if (aarch64_sve_data_mode_p (mode)
9319 && constant_multiple_p (GET_MODE_SIZE (mode),
9320 BYTES_PER_SVE_VECTOR, &nregs))
9321 return nregs;
9322 return (aarch64_vector_data_mode_p (mode)
9323 ? CEIL (lowest_size, UNITS_PER_VREG)
9324 : CEIL (lowest_size, UNITS_PER_WORD));
9325 case STACK_REG:
9326 case PR_REGS:
9327 case PR_LO_REGS:
9328 case PR_HI_REGS:
9329 return 1;
9330
9331 case NO_REGS:
9332 return 0;
9333
9334 default:
9335 break;
9336 }
9337 gcc_unreachable ();
9338 }
9339
9340 static reg_class_t
9341 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9342 {
9343 if (regclass == POINTER_REGS)
9344 return GENERAL_REGS;
9345
9346 if (regclass == STACK_REG)
9347 {
9348 if (REG_P (x)
9349 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9350 return regclass;
9351
9352 return NO_REGS;
9353 }
9354
9355 /* Register elimination can result in a request for
9356 SP+constant->FP_REGS. We cannot support such operations, which
9357 use SP as the source and an FP_REG as the destination, so reject
9358 them right away. */
9359 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9360 {
9361 rtx lhs = XEXP (x, 0);
9362
9363 /* Look through a possible SUBREG introduced by ILP32. */
9364 if (GET_CODE (lhs) == SUBREG)
9365 lhs = SUBREG_REG (lhs);
9366
9367 gcc_assert (REG_P (lhs));
9368 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9369 POINTER_REGS));
9370 return NO_REGS;
9371 }
9372
9373 return regclass;
9374 }
9375
9376 void
9377 aarch64_asm_output_labelref (FILE* f, const char *name)
9378 {
9379 asm_fprintf (f, "%U%s", name);
9380 }
9381
9382 static void
9383 aarch64_elf_asm_constructor (rtx symbol, int priority)
9384 {
9385 if (priority == DEFAULT_INIT_PRIORITY)
9386 default_ctor_section_asm_out_constructor (symbol, priority);
9387 else
9388 {
9389 section *s;
9390 /* While priority is known to be in range [0, 65535], so 18 bytes
9391 would be enough, the compiler might not know that. To avoid
9392 -Wformat-truncation false positive, use a larger size. */
9393 char buf[23];
9394 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9395 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9396 switch_to_section (s);
9397 assemble_align (POINTER_SIZE);
9398 assemble_aligned_integer (POINTER_BYTES, symbol);
9399 }
9400 }
9401
9402 static void
9403 aarch64_elf_asm_destructor (rtx symbol, int priority)
9404 {
9405 if (priority == DEFAULT_INIT_PRIORITY)
9406 default_dtor_section_asm_out_destructor (symbol, priority);
9407 else
9408 {
9409 section *s;
9410 /* While priority is known to be in range [0, 65535], so 18 bytes
9411 would be enough, the compiler might not know that. To avoid
9412 -Wformat-truncation false positive, use a larger size. */
9413 char buf[23];
9414 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9415 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9416 switch_to_section (s);
9417 assemble_align (POINTER_SIZE);
9418 assemble_aligned_integer (POINTER_BYTES, symbol);
9419 }
9420 }
9421
9422 const char*
9423 aarch64_output_casesi (rtx *operands)
9424 {
9425 char buf[100];
9426 char label[100];
9427 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9428 int index;
9429 static const char *const patterns[4][2] =
9430 {
9431 {
9432 "ldrb\t%w3, [%0,%w1,uxtw]",
9433 "add\t%3, %4, %w3, sxtb #2"
9434 },
9435 {
9436 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9437 "add\t%3, %4, %w3, sxth #2"
9438 },
9439 {
9440 "ldr\t%w3, [%0,%w1,uxtw #2]",
9441 "add\t%3, %4, %w3, sxtw #2"
9442 },
9443 /* We assume that DImode is only generated when not optimizing and
9444 that we don't really need 64-bit address offsets. That would
9445 imply an object file with 8GB of code in a single function! */
9446 {
9447 "ldr\t%w3, [%0,%w1,uxtw #2]",
9448 "add\t%3, %4, %w3, sxtw #2"
9449 }
9450 };
9451
9452 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9453
9454 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9455 index = exact_log2 (GET_MODE_SIZE (mode));
9456
9457 gcc_assert (index >= 0 && index <= 3);
9458
9459 /* Need to implement table size reduction by changing the code below. */
9460 output_asm_insn (patterns[index][0], operands);
9461 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9462 snprintf (buf, sizeof (buf),
9463 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9464 output_asm_insn (buf, operands);
9465 output_asm_insn (patterns[index][1], operands);
9466 output_asm_insn ("br\t%3", operands);
9467 assemble_label (asm_out_file, label);
9468 return "";
9469 }
9470
9471
9472 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9473 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9474 operator. */
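/* For instance, aarch64_uxt_size (1, 0x1fe) returns 8, because
   0x1fe == 0xff << 1, i.e. the masked, shifted operand fits a UXTB
   extended-register form; a mask/shift pair that matches no such
   pattern yields 0.  */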
9475
9476 int
9477 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9478 {
9479 if (shift >= 0 && shift <= 3)
9480 {
9481 int size;
9482 for (size = 8; size <= 32; size *= 2)
9483 {
9484 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9485 if (mask == bits << shift)
9486 return size;
9487 }
9488 }
9489 return 0;
9490 }
9491
9492 /* Constant pools are per-function only when PC-relative literal
9493 loads are enabled or we are using the large memory
9494 model. */
9495
9496 static inline bool
9497 aarch64_can_use_per_function_literal_pools_p (void)
9498 {
9499 return (aarch64_pcrelative_literal_loads
9500 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9501 }
9502
9503 static bool
9504 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9505 {
9506 /* We can't use blocks for constants when we're using a per-function
9507 constant pool. */
9508 return !aarch64_can_use_per_function_literal_pools_p ();
9509 }
9510
9511 /* Select appropriate section for constants depending
9512 on where we place literal pools. */
9513
9514 static section *
9515 aarch64_select_rtx_section (machine_mode mode,
9516 rtx x,
9517 unsigned HOST_WIDE_INT align)
9518 {
9519 if (aarch64_can_use_per_function_literal_pools_p ())
9520 return function_section (current_function_decl);
9521
9522 return default_elf_select_rtx_section (mode, x, align);
9523 }
9524
9525 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9526 void
9527 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9528 HOST_WIDE_INT offset)
9529 {
9530 /* When using per-function literal pools, we must ensure that any code
9531 section is aligned to the minimal instruction length, lest we get
9532 errors from the assembler re "unaligned instructions". */
9533 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9534 ASM_OUTPUT_ALIGN (f, 2);
9535 }
9536
9537 /* Costs. */
9538
9539 /* Helper function for rtx cost calculation. Strip a shift expression
9540 from X. Returns the inner operand if successful, or the original
9541 expression on failure. */
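/* For example, both (ashift (reg) (const_int 3)) and its canonical
   multiply form (mult (reg) (const_int 8)) strip down to (reg); rotates
   by a constant are stripped as well, since they become ROR on output.  */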
9542 static rtx
9543 aarch64_strip_shift (rtx x)
9544 {
9545 rtx op = x;
9546
9547 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9548 we can convert both to ROR during final output. */
9549 if ((GET_CODE (op) == ASHIFT
9550 || GET_CODE (op) == ASHIFTRT
9551 || GET_CODE (op) == LSHIFTRT
9552 || GET_CODE (op) == ROTATERT
9553 || GET_CODE (op) == ROTATE)
9554 && CONST_INT_P (XEXP (op, 1)))
9555 return XEXP (op, 0);
9556
9557 if (GET_CODE (op) == MULT
9558 && CONST_INT_P (XEXP (op, 1))
9559 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9560 return XEXP (op, 0);
9561
9562 return x;
9563 }
9564
9565 /* Helper function for rtx cost calculation. Strip an extend
9566 expression from X. Returns the inner operand if successful, or the
9567 original expression on failure. We deal with a number of possible
9568 canonicalization variations here. If STRIP_SHIFT is true, then
9569 we can strip off a shift also. */
9570 static rtx
9571 aarch64_strip_extend (rtx x, bool strip_shift)
9572 {
9573 scalar_int_mode mode;
9574 rtx op = x;
9575
9576 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9577 return op;
9578
9579 /* Zero and sign extraction of a widened value. */
9580 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9581 && XEXP (op, 2) == const0_rtx
9582 && GET_CODE (XEXP (op, 0)) == MULT
9583 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9584 XEXP (op, 1)))
9585 return XEXP (XEXP (op, 0), 0);
9586
9587 /* It can also be represented (for zero-extend) as an AND with an
9588 immediate. */
9589 if (GET_CODE (op) == AND
9590 && GET_CODE (XEXP (op, 0)) == MULT
9591 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9592 && CONST_INT_P (XEXP (op, 1))
9593 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9594 INTVAL (XEXP (op, 1))) != 0)
9595 return XEXP (XEXP (op, 0), 0);
9596
9597 /* Now handle extended register, as this may also have an optional
9598 left shift by 1..4. */
9599 if (strip_shift
9600 && GET_CODE (op) == ASHIFT
9601 && CONST_INT_P (XEXP (op, 1))
9602 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9603 op = XEXP (op, 0);
9604
9605 if (GET_CODE (op) == ZERO_EXTEND
9606 || GET_CODE (op) == SIGN_EXTEND)
9607 op = XEXP (op, 0);
9608
9609 if (op != x)
9610 return op;
9611
9612 return x;
9613 }
9614
9615 /* Return true iff CODE is a shift supported in combination
9616 with arithmetic instructions. */
9617
9618 static bool
9619 aarch64_shift_p (enum rtx_code code)
9620 {
9621 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9622 }
9623
9624
9625 /* Return true iff X is a cheap shift without a sign extend. */
9626
9627 static bool
9628 aarch64_cheap_mult_shift_p (rtx x)
9629 {
9630 rtx op0, op1;
9631
9632 op0 = XEXP (x, 0);
9633 op1 = XEXP (x, 1);
9634
9635 if (!(aarch64_tune_params.extra_tuning_flags
9636 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9637 return false;
9638
9639 if (GET_CODE (op0) == SIGN_EXTEND)
9640 return false;
9641
9642 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9643 && UINTVAL (op1) <= 4)
9644 return true;
9645
9646 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9647 return false;
9648
9649 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9650
9651 if (l2 > 0 && l2 <= 4)
9652 return true;
9653
9654 return false;
9655 }
9656
9657 /* Helper function for rtx cost calculation. Calculate the cost of
9658 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9659 Return the calculated cost of the expression, recursing manually into
9660 the operands where needed. */
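/* A short illustration: inside a PLUS, a multiply by a power of two such
   as (plus (mult x 4) y) is costed as an arithmetic operation with a
   shifted operand (e.g. "add x0, x1, x2, lsl 2") rather than as a real
   multiply, which is what the shift-handling branch below models.  */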
9661
9662 static int
9663 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9664 {
9665 rtx op0, op1;
9666 const struct cpu_cost_table *extra_cost
9667 = aarch64_tune_params.insn_extra_cost;
9668 int cost = 0;
9669 bool compound_p = (outer == PLUS || outer == MINUS);
9670 machine_mode mode = GET_MODE (x);
9671
9672 gcc_checking_assert (code == MULT);
9673
9674 op0 = XEXP (x, 0);
9675 op1 = XEXP (x, 1);
9676
9677 if (VECTOR_MODE_P (mode))
9678 mode = GET_MODE_INNER (mode);
9679
9680 /* Integer multiply/fma. */
9681 if (GET_MODE_CLASS (mode) == MODE_INT)
9682 {
9683 /* The multiply will be canonicalized as a shift, cost it as such. */
9684 if (aarch64_shift_p (GET_CODE (x))
9685 || (CONST_INT_P (op1)
9686 && exact_log2 (INTVAL (op1)) > 0))
9687 {
9688 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9689 || GET_CODE (op0) == SIGN_EXTEND;
9690 if (speed)
9691 {
9692 if (compound_p)
9693 {
9694 /* If the shift is considered cheap,
9695 then don't add any cost. */
9696 if (aarch64_cheap_mult_shift_p (x))
9697 ;
9698 else if (REG_P (op1))
9699 /* ARITH + shift-by-register. */
9700 cost += extra_cost->alu.arith_shift_reg;
9701 else if (is_extend)
9702 /* ARITH + extended register. We don't have a cost field
9703 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9704 cost += extra_cost->alu.extend_arith;
9705 else
9706 /* ARITH + shift-by-immediate. */
9707 cost += extra_cost->alu.arith_shift;
9708 }
9709 else
9710 /* LSL (immediate). */
9711 cost += extra_cost->alu.shift;
9712
9713 }
9714 /* Strip extends as we will have costed them in the case above. */
9715 if (is_extend)
9716 op0 = aarch64_strip_extend (op0, true);
9717
9718 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9719
9720 return cost;
9721 }
9722
9723 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9724 compound and let the below cases handle it. After all, MNEG is a
9725 special-case alias of MSUB. */
9726 if (GET_CODE (op0) == NEG)
9727 {
9728 op0 = XEXP (op0, 0);
9729 compound_p = true;
9730 }
9731
9732 /* Integer multiplies or FMAs have zero/sign extending variants. */
9733 if ((GET_CODE (op0) == ZERO_EXTEND
9734 && GET_CODE (op1) == ZERO_EXTEND)
9735 || (GET_CODE (op0) == SIGN_EXTEND
9736 && GET_CODE (op1) == SIGN_EXTEND))
9737 {
9738 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9739 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9740
9741 if (speed)
9742 {
9743 if (compound_p)
9744 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9745 cost += extra_cost->mult[0].extend_add;
9746 else
9747 /* MUL/SMULL/UMULL. */
9748 cost += extra_cost->mult[0].extend;
9749 }
9750
9751 return cost;
9752 }
9753
9754 /* This is either an integer multiply or a MADD. In both cases
9755 we want to recurse and cost the operands. */
9756 cost += rtx_cost (op0, mode, MULT, 0, speed);
9757 cost += rtx_cost (op1, mode, MULT, 1, speed);
9758
9759 if (speed)
9760 {
9761 if (compound_p)
9762 /* MADD/MSUB. */
9763 cost += extra_cost->mult[mode == DImode].add;
9764 else
9765 /* MUL. */
9766 cost += extra_cost->mult[mode == DImode].simple;
9767 }
9768
9769 return cost;
9770 }
9771 else
9772 {
9773 if (speed)
9774 {
9775 /* Floating-point FMA/FMUL can also support negations of the
9776 operands, unless the rounding mode is upward or downward in
9777 which case FNMUL is different than FMUL with operand negation. */
9778 bool neg0 = GET_CODE (op0) == NEG;
9779 bool neg1 = GET_CODE (op1) == NEG;
9780 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9781 {
9782 if (neg0)
9783 op0 = XEXP (op0, 0);
9784 if (neg1)
9785 op1 = XEXP (op1, 0);
9786 }
9787
9788 if (compound_p)
9789 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9790 cost += extra_cost->fp[mode == DFmode].fma;
9791 else
9792 /* FMUL/FNMUL. */
9793 cost += extra_cost->fp[mode == DFmode].mult;
9794 }
9795
9796 cost += rtx_cost (op0, mode, MULT, 0, speed);
9797 cost += rtx_cost (op1, mode, MULT, 1, speed);
9798 return cost;
9799 }
9800 }
9801
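/* Return the cost of address X for an access of mode MODE (the target's
   address-cost hook, judging by the signature): classify the address and
   add the per-form costs from the tuning-specific addr_cost table.  */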
9802 static int
9803 aarch64_address_cost (rtx x,
9804 machine_mode mode,
9805 addr_space_t as ATTRIBUTE_UNUSED,
9806 bool speed)
9807 {
9808 enum rtx_code c = GET_CODE (x);
9809 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9810 struct aarch64_address_info info;
9811 int cost = 0;
9812 info.shift = 0;
9813
9814 if (!aarch64_classify_address (&info, x, mode, false))
9815 {
9816 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9817 {
9818 /* This is a CONST or SYMBOL ref which will be split
9819 in a different way depending on the code model in use.
9820 Cost it through the generic infrastructure. */
9821 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9822 /* Divide through by the cost of one instruction to
9823 bring it to the same units as the address costs. */
9824 cost_symbol_ref /= COSTS_N_INSNS (1);
9825 /* The cost is then the cost of preparing the address,
9826 followed by an immediate (possibly 0) offset. */
9827 return cost_symbol_ref + addr_cost->imm_offset;
9828 }
9829 else
9830 {
9831 /* This is most likely a jump table from a case
9832 statement. */
9833 return addr_cost->register_offset;
9834 }
9835 }
9836
9837 switch (info.type)
9838 {
9839 case ADDRESS_LO_SUM:
9840 case ADDRESS_SYMBOLIC:
9841 case ADDRESS_REG_IMM:
9842 cost += addr_cost->imm_offset;
9843 break;
9844
9845 case ADDRESS_REG_WB:
9846 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9847 cost += addr_cost->pre_modify;
9848 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9849 cost += addr_cost->post_modify;
9850 else
9851 gcc_unreachable ();
9852
9853 break;
9854
9855 case ADDRESS_REG_REG:
9856 cost += addr_cost->register_offset;
9857 break;
9858
9859 case ADDRESS_REG_SXTW:
9860 cost += addr_cost->register_sextend;
9861 break;
9862
9863 case ADDRESS_REG_UXTW:
9864 cost += addr_cost->register_zextend;
9865 break;
9866
9867 default:
9868 gcc_unreachable ();
9869 }
9870
9871
9872 if (info.shift > 0)
9873 {
9874 /* For the sake of calculating the cost of the shifted register
9875 component, we can treat same sized modes in the same way. */
9876 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9877 cost += addr_cost->addr_scale_costs.hi;
9878 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9879 cost += addr_cost->addr_scale_costs.si;
9880 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9881 cost += addr_cost->addr_scale_costs.di;
9882 else
9883 /* We can't tell, or this is a 128-bit vector. */
9884 cost += addr_cost->addr_scale_costs.ti;
9885 }
9886
9887 return cost;
9888 }
9889
9890 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9891 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9892 to be taken. */
9893
9894 int
9895 aarch64_branch_cost (bool speed_p, bool predictable_p)
9896 {
9897 /* When optimizing for speed, use the cost of unpredictable branches. */
9898 const struct cpu_branch_cost *branch_costs =
9899 aarch64_tune_params.branch_costs;
9900
9901 if (!speed_p || predictable_p)
9902 return branch_costs->predictable;
9903 else
9904 return branch_costs->unpredictable;
9905 }
9906
9907 /* Return true if the RTX X in mode MODE is a zero or sign extract
9908 usable in an ADD or SUB (extended register) instruction. */
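/* The simplest accepted form is a plain (sign_extend (reg)) or
   (zero_extend (reg)), which corresponds to an operand like "w2, sxtw"
   in "add x0, x1, w2, sxtw"; the extract forms below cover the
   shifted/scaled variants.  */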
9909 static bool
9910 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9911 {
9912 /* Catch add with a sign extract.
9913 This is add_<optab><mode>_multp2. */
9914 if (GET_CODE (x) == SIGN_EXTRACT
9915 || GET_CODE (x) == ZERO_EXTRACT)
9916 {
9917 rtx op0 = XEXP (x, 0);
9918 rtx op1 = XEXP (x, 1);
9919 rtx op2 = XEXP (x, 2);
9920
9921 if (GET_CODE (op0) == MULT
9922 && CONST_INT_P (op1)
9923 && op2 == const0_rtx
9924 && CONST_INT_P (XEXP (op0, 1))
9925 && aarch64_is_extend_from_extract (mode,
9926 XEXP (op0, 1),
9927 op1))
9928 {
9929 return true;
9930 }
9931 }
9932 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9933 No shift. */
9934 else if (GET_CODE (x) == SIGN_EXTEND
9935 || GET_CODE (x) == ZERO_EXTEND)
9936 return REG_P (XEXP (x, 0));
9937
9938 return false;
9939 }
9940
9941 static bool
9942 aarch64_frint_unspec_p (unsigned int u)
9943 {
9944 switch (u)
9945 {
9946 case UNSPEC_FRINTZ:
9947 case UNSPEC_FRINTP:
9948 case UNSPEC_FRINTM:
9949 case UNSPEC_FRINTA:
9950 case UNSPEC_FRINTN:
9951 case UNSPEC_FRINTX:
9952 case UNSPEC_FRINTI:
9953 return true;
9954
9955 default:
9956 return false;
9957 }
9958 }
9959
9960 /* Return true iff X is an rtx that will match an extr instruction
9961 i.e. as described in the *extr<mode>5_insn family of patterns.
9962 OP0 and OP1 will be set to the operands of the shifts involved
9963 on success and will be NULL_RTX otherwise. */
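/* For instance, in DImode
     (ior (ashift a (const_int 48)) (lshiftrt b (const_int 16)))
   matches, since the two shift amounts sum to 64 (the mode's bit size);
   *RES_OP0 is then set to a and *RES_OP1 to b.  */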
9964
9965 static bool
9966 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9967 {
9968 rtx op0, op1;
9969 scalar_int_mode mode;
9970 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9971 return false;
9972
9973 *res_op0 = NULL_RTX;
9974 *res_op1 = NULL_RTX;
9975
9976 if (GET_CODE (x) != IOR)
9977 return false;
9978
9979 op0 = XEXP (x, 0);
9980 op1 = XEXP (x, 1);
9981
9982 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9983 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9984 {
9985 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9986 if (GET_CODE (op1) == ASHIFT)
9987 std::swap (op0, op1);
9988
9989 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9990 return false;
9991
9992 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9993 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9994
9995 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9996 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9997 {
9998 *res_op0 = XEXP (op0, 0);
9999 *res_op1 = XEXP (op1, 0);
10000 return true;
10001 }
10002 }
10003
10004 return false;
10005 }
10006
10007 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10008 storing it in *COST. Result is true if the total cost of the operation
10009 has now been calculated. */
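/* Broad shape of the cases below: if OP1 or OP2 is pc the construct is a
   conditional branch (costed as B.cond, CBZ/CBNZ or TBZ/TBNZ); if OP0
   operates on the CC register it is costed as a CCMP or as some flavour
   of CSEL (CSINC/CSINV/CSNEG are folded in for free).  */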
10010 static bool
10011 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10012 {
10013 rtx inner;
10014 rtx comparator;
10015 enum rtx_code cmpcode;
10016
10017 if (COMPARISON_P (op0))
10018 {
10019 inner = XEXP (op0, 0);
10020 comparator = XEXP (op0, 1);
10021 cmpcode = GET_CODE (op0);
10022 }
10023 else
10024 {
10025 inner = op0;
10026 comparator = const0_rtx;
10027 cmpcode = NE;
10028 }
10029
10030 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10031 {
10032 /* Conditional branch. */
10033 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10034 return true;
10035 else
10036 {
10037 if (cmpcode == NE || cmpcode == EQ)
10038 {
10039 if (comparator == const0_rtx)
10040 {
10041 /* TBZ/TBNZ/CBZ/CBNZ. */
10042 if (GET_CODE (inner) == ZERO_EXTRACT)
10043 /* TBZ/TBNZ. */
10044 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10045 ZERO_EXTRACT, 0, speed);
10046 else
10047 /* CBZ/CBNZ. */
10048 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10049
10050 return true;
10051 }
10052 }
10053 else if (cmpcode == LT || cmpcode == GE)
10054 {
10055 /* TBZ/TBNZ. */
10056 if (comparator == const0_rtx)
10057 return true;
10058 }
10059 }
10060 }
10061 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10062 {
10063 /* CCMP. */
10064 if (GET_CODE (op1) == COMPARE)
10065 {
10066 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10067 if (XEXP (op1, 1) == const0_rtx)
10068 *cost += 1;
10069 if (speed)
10070 {
10071 machine_mode mode = GET_MODE (XEXP (op1, 0));
10072 const struct cpu_cost_table *extra_cost
10073 = aarch64_tune_params.insn_extra_cost;
10074
10075 if (GET_MODE_CLASS (mode) == MODE_INT)
10076 *cost += extra_cost->alu.arith;
10077 else
10078 *cost += extra_cost->fp[mode == DFmode].compare;
10079 }
10080 return true;
10081 }
10082
10083 /* It's a conditional operation based on the status flags,
10084 so it must be some flavor of CSEL. */
10085
10086 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10087 if (GET_CODE (op1) == NEG
10088 || GET_CODE (op1) == NOT
10089 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10090 op1 = XEXP (op1, 0);
10091 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10092 {
10093 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10094 op1 = XEXP (op1, 0);
10095 op2 = XEXP (op2, 0);
10096 }
10097
10098 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10099 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10100 return true;
10101 }
10102
10103 /* We don't know what this is, cost all operands. */
10104 return false;
10105 }
10106
10107 /* Check whether X is a bitfield operation of the form shift + extend that
10108 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10109 operand to which the bitfield operation is applied. Otherwise return
10110 NULL_RTX. */
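/* For example, (zero_extend:SI (lshiftrt:HI (reg) (const_int 3))) is the
   kind of shift + extend combination that maps to a UBFX, and the inner
   (reg) is returned; the ASHIFT and ASHIFTRT cases below cover the
   UBFIZ/SBFIZ and SBFX forms.  */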
10111
10112 static rtx
10113 aarch64_extend_bitfield_pattern_p (rtx x)
10114 {
10115 rtx_code outer_code = GET_CODE (x);
10116 machine_mode outer_mode = GET_MODE (x);
10117
10118 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10119 && outer_mode != SImode && outer_mode != DImode)
10120 return NULL_RTX;
10121
10122 rtx inner = XEXP (x, 0);
10123 rtx_code inner_code = GET_CODE (inner);
10124 machine_mode inner_mode = GET_MODE (inner);
10125 rtx op = NULL_RTX;
10126
10127 switch (inner_code)
10128 {
10129 case ASHIFT:
10130 if (CONST_INT_P (XEXP (inner, 1))
10131 && (inner_mode == QImode || inner_mode == HImode))
10132 op = XEXP (inner, 0);
10133 break;
10134 case LSHIFTRT:
10135 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10136 && (inner_mode == QImode || inner_mode == HImode))
10137 op = XEXP (inner, 0);
10138 break;
10139 case ASHIFTRT:
10140 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10141 && (inner_mode == QImode || inner_mode == HImode))
10142 op = XEXP (inner, 0);
10143 break;
10144 default:
10145 break;
10146 }
10147
10148 return op;
10149 }
10150
10151 /* Return true if the mask and a shift amount from an RTX of the form
10152 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10153 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
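/* For example, in SImode a shift amount of 4 with a mask of 0xff0
   qualifies: shifting the mask right by 4 gives the contiguous value 0xff
   (so exact_log2 (0xff + 1) >= 0) and no mask bits lie below the shift
   amount.  */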
10154
10155 bool
10156 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10157 rtx shft_amnt)
10158 {
10159 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10160 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10161 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10162 && (INTVAL (mask)
10163 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10164 }
10165
10166 /* Return true if the masks and a shift amount from an RTX of the form
10167 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10168 a BFI instruction of mode MODE. See *aarch64_bfi patterns. */
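/* For example, SHFT_AMNT == 4 with MASK2 == 0xff0 and MASK1 == ~0xff0
   passes the checks below: the masks are exact complements, MASK2 is
   neither all-zeros nor all-ones, and MASK2 + (1 << 4) == 0x1000 is a
   power of two, i.e. the shifted field is contiguous.  */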
10169
10170 bool
10171 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10172 unsigned HOST_WIDE_INT mask1,
10173 unsigned HOST_WIDE_INT shft_amnt,
10174 unsigned HOST_WIDE_INT mask2)
10175 {
10176 unsigned HOST_WIDE_INT t;
10177
10178 /* Verify that there is no overlap in what bits are set in the two masks. */
10179 if (mask1 != ~mask2)
10180 return false;
10181
10182 /* Verify that mask2 is not all zeros or ones. */
10183 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10184 return false;
10185
10186 /* The shift amount should always be less than the mode size. */
10187 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10188
10189 /* Verify that the mask being shifted is contiguous and would be in the
10190 least significant bits after shifting by shft_amnt. */
10191 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10192 return (t == (t & -t));
10193 }
10194
10195 /* Calculate the cost of calculating X, storing it in *COST. Result
10196 is true if the total cost of the operation has now been calculated. */
10197 static bool
10198 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10199 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10200 {
10201 rtx op0, op1, op2;
10202 const struct cpu_cost_table *extra_cost
10203 = aarch64_tune_params.insn_extra_cost;
10204 int code = GET_CODE (x);
10205 scalar_int_mode int_mode;
10206
10207 /* By default, assume that everything has equivalent cost to the
10208 cheapest instruction. Any additional costs are applied as a delta
10209 above this default. */
10210 *cost = COSTS_N_INSNS (1);
10211
10212 switch (code)
10213 {
10214 case SET:
10215 /* The cost depends entirely on the operands to SET. */
10216 *cost = 0;
10217 op0 = SET_DEST (x);
10218 op1 = SET_SRC (x);
10219
10220 switch (GET_CODE (op0))
10221 {
10222 case MEM:
10223 if (speed)
10224 {
10225 rtx address = XEXP (op0, 0);
10226 if (VECTOR_MODE_P (mode))
10227 *cost += extra_cost->ldst.storev;
10228 else if (GET_MODE_CLASS (mode) == MODE_INT)
10229 *cost += extra_cost->ldst.store;
10230 else if (mode == SFmode)
10231 *cost += extra_cost->ldst.storef;
10232 else if (mode == DFmode)
10233 *cost += extra_cost->ldst.stored;
10234
10235 *cost +=
10236 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10237 0, speed));
10238 }
10239
10240 *cost += rtx_cost (op1, mode, SET, 1, speed);
10241 return true;
10242
10243 case SUBREG:
10244 if (! REG_P (SUBREG_REG (op0)))
10245 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10246
10247 /* Fall through. */
10248 case REG:
10249 /* The cost is one per vector-register copied. */
10250 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10251 {
10252 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10253 *cost = COSTS_N_INSNS (nregs);
10254 }
10255 /* const0_rtx is in general free, but we will use an
10256 instruction to set a register to 0. */
10257 else if (REG_P (op1) || op1 == const0_rtx)
10258 {
10259 /* The cost is 1 per register copied. */
10260 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10261 *cost = COSTS_N_INSNS (nregs);
10262 }
10263 else
10264 /* Cost is just the cost of the RHS of the set. */
10265 *cost += rtx_cost (op1, mode, SET, 1, speed);
10266 return true;
10267
10268 case ZERO_EXTRACT:
10269 case SIGN_EXTRACT:
10270 /* Bit-field insertion. Strip any redundant widening of
10271 the RHS to meet the width of the target. */
10272 if (GET_CODE (op1) == SUBREG)
10273 op1 = SUBREG_REG (op1);
10274 if ((GET_CODE (op1) == ZERO_EXTEND
10275 || GET_CODE (op1) == SIGN_EXTEND)
10276 && CONST_INT_P (XEXP (op0, 1))
10277 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10278 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10279 op1 = XEXP (op1, 0);
10280
10281 if (CONST_INT_P (op1))
10282 {
10283 /* MOV immediate is assumed to always be cheap. */
10284 *cost = COSTS_N_INSNS (1);
10285 }
10286 else
10287 {
10288 /* BFM. */
10289 if (speed)
10290 *cost += extra_cost->alu.bfi;
10291 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10292 }
10293
10294 return true;
10295
10296 default:
10297 /* We can't make sense of this, assume default cost. */
10298 *cost = COSTS_N_INSNS (1);
10299 return false;
10300 }
10301 return false;
10302
10303 case CONST_INT:
10304 /* If an instruction can incorporate a constant within the
10305 instruction, the instruction's expression avoids calling
10306 rtx_cost() on the constant. If rtx_cost() is called on a
10307 constant, then it is usually because the constant must be
10308 moved into a register by one or more instructions.
10309
10310 The exception is constant 0, which can be expressed
10311 as XZR/WZR and is therefore free. The exception to this is
10312 if we have (set (reg) (const0_rtx)) in which case we must cost
10313 the move. However, we can catch that when we cost the SET, so
10314 we don't need to consider that here. */
10315 if (x == const0_rtx)
10316 *cost = 0;
10317 else
10318 {
10319 /* To an approximation, building any other constant is
10320 proportionally expensive to the number of instructions
10321 required to build that constant. This is true whether we
10322 are compiling for SPEED or otherwise. */
10323 if (!is_a <scalar_int_mode> (mode, &int_mode))
10324 int_mode = word_mode;
10325 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10326 (NULL_RTX, x, false, int_mode));
10327 }
10328 return true;
10329
10330 case CONST_DOUBLE:
10331
10332 /* First determine number of instructions to do the move
10333 as an integer constant. */
10334 if (!aarch64_float_const_representable_p (x)
10335 && !aarch64_can_const_movi_rtx_p (x, mode)
10336 && aarch64_float_const_rtx_p (x))
10337 {
10338 unsigned HOST_WIDE_INT ival;
10339 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10340 gcc_assert (succeed);
10341
10342 scalar_int_mode imode = (mode == HFmode
10343 ? SImode
10344 : int_mode_for_mode (mode).require ());
10345 int ncost = aarch64_internal_mov_immediate
10346 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10347 *cost += COSTS_N_INSNS (ncost);
10348 return true;
10349 }
10350
10351 if (speed)
10352 {
10353 /* mov[df,sf]_aarch64. */
10354 if (aarch64_float_const_representable_p (x))
10355 /* FMOV (scalar immediate). */
10356 *cost += extra_cost->fp[mode == DFmode].fpconst;
10357 else if (!aarch64_float_const_zero_rtx_p (x))
10358 {
10359 /* This will be a load from memory. */
10360 if (mode == DFmode)
10361 *cost += extra_cost->ldst.loadd;
10362 else
10363 *cost += extra_cost->ldst.loadf;
10364 }
10365 else
10366 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10367 or MOV v0.s[0], wzr - neither of which is modeled by the
10368 cost tables. Just use the default cost. */
10369 {
10370 }
10371 }
10372
10373 return true;
10374
10375 case MEM:
10376 if (speed)
10377 {
10378 /* For loads we want the base cost of a load, plus an
10379 approximation for the additional cost of the addressing
10380 mode. */
10381 rtx address = XEXP (x, 0);
10382 if (VECTOR_MODE_P (mode))
10383 *cost += extra_cost->ldst.loadv;
10384 else if (GET_MODE_CLASS (mode) == MODE_INT)
10385 *cost += extra_cost->ldst.load;
10386 else if (mode == SFmode)
10387 *cost += extra_cost->ldst.loadf;
10388 else if (mode == DFmode)
10389 *cost += extra_cost->ldst.loadd;
10390
10391 *cost +=
10392 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10393 0, speed));
10394 }
10395
10396 return true;
10397
10398 case NEG:
10399 op0 = XEXP (x, 0);
10400
10401 if (VECTOR_MODE_P (mode))
10402 {
10403 if (speed)
10404 {
10405 /* FNEG. */
10406 *cost += extra_cost->vect.alu;
10407 }
10408 return false;
10409 }
10410
10411 if (GET_MODE_CLASS (mode) == MODE_INT)
10412 {
10413 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10414 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10415 {
10416 /* CSETM. */
10417 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10418 return true;
10419 }
10420
10421 /* Cost this as SUB wzr, X. */
10422 op0 = CONST0_RTX (mode);
10423 op1 = XEXP (x, 0);
10424 goto cost_minus;
10425 }
10426
10427 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10428 {
10429 /* Support (neg(fma...)) as a single instruction only if
10430 sign of zeros is unimportant. This matches the decision
10431 making in aarch64.md. */
10432 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10433 {
10434 /* FNMADD. */
10435 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10436 return true;
10437 }
10438 if (GET_CODE (op0) == MULT)
10439 {
10440 /* FNMUL. */
10441 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10442 return true;
10443 }
10444 if (speed)
10445 /* FNEG. */
10446 *cost += extra_cost->fp[mode == DFmode].neg;
10447 return false;
10448 }
10449
10450 return false;
10451
10452 case CLRSB:
10453 case CLZ:
10454 if (speed)
10455 {
10456 if (VECTOR_MODE_P (mode))
10457 *cost += extra_cost->vect.alu;
10458 else
10459 *cost += extra_cost->alu.clz;
10460 }
10461
10462 return false;
10463
10464 case COMPARE:
10465 op0 = XEXP (x, 0);
10466 op1 = XEXP (x, 1);
10467
10468 if (op1 == const0_rtx
10469 && GET_CODE (op0) == AND)
10470 {
10471 x = op0;
10472 mode = GET_MODE (op0);
10473 goto cost_logic;
10474 }
10475
10476 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10477 {
10478 /* TODO: A write to the CC flags possibly costs extra, this
10479 needs encoding in the cost tables. */
10480
10481 mode = GET_MODE (op0);
10482 /* ANDS. */
10483 if (GET_CODE (op0) == AND)
10484 {
10485 x = op0;
10486 goto cost_logic;
10487 }
10488
10489 if (GET_CODE (op0) == PLUS)
10490 {
10491 /* ADDS (and CMN alias). */
10492 x = op0;
10493 goto cost_plus;
10494 }
10495
10496 if (GET_CODE (op0) == MINUS)
10497 {
10498 /* SUBS. */
10499 x = op0;
10500 goto cost_minus;
10501 }
10502
10503 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10504 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10505 && CONST_INT_P (XEXP (op0, 2)))
10506 {
10507 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10508 Handle it here directly rather than going to cost_logic
10509 since we know the immediate generated for the TST is valid
10510 so we can avoid creating an intermediate rtx for it only
10511 for costing purposes. */
10512 if (speed)
10513 *cost += extra_cost->alu.logical;
10514
10515 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10516 ZERO_EXTRACT, 0, speed);
10517 return true;
10518 }
10519
10520 if (GET_CODE (op1) == NEG)
10521 {
10522 /* CMN. */
10523 if (speed)
10524 *cost += extra_cost->alu.arith;
10525
10526 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10527 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10528 return true;
10529 }
10530
10531 /* CMP.
10532
10533 Compare can freely swap the order of operands, and
10534 canonicalization puts the more complex operation first.
10535 But the integer MINUS logic expects the shift/extend
10536 operation in op1. */
10537 if (! (REG_P (op0)
10538 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10539 {
10540 op0 = XEXP (x, 1);
10541 op1 = XEXP (x, 0);
10542 }
10543 goto cost_minus;
10544 }
10545
10546 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10547 {
10548 /* FCMP. */
10549 if (speed)
10550 *cost += extra_cost->fp[mode == DFmode].compare;
10551
10552 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10553 {
10554 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10555 /* FCMP supports constant 0.0 for no extra cost. */
10556 return true;
10557 }
10558 return false;
10559 }
10560
10561 if (VECTOR_MODE_P (mode))
10562 {
10563 /* Vector compare. */
10564 if (speed)
10565 *cost += extra_cost->vect.alu;
10566
10567 if (aarch64_float_const_zero_rtx_p (op1))
10568 {
10569 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10570 cost. */
10571 return true;
10572 }
10573 return false;
10574 }
10575 return false;
10576
10577 case MINUS:
10578 {
10579 op0 = XEXP (x, 0);
10580 op1 = XEXP (x, 1);
10581
10582 cost_minus:
10583 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10584
10585 /* Detect valid immediates. */
10586 if ((GET_MODE_CLASS (mode) == MODE_INT
10587 || (GET_MODE_CLASS (mode) == MODE_CC
10588 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10589 && CONST_INT_P (op1)
10590 && aarch64_uimm12_shift (INTVAL (op1)))
10591 {
10592 if (speed)
10593 /* SUB(S) (immediate). */
10594 *cost += extra_cost->alu.arith;
10595 return true;
10596 }
10597
10598 /* Look for SUB (extended register). */
10599 if (is_a <scalar_int_mode> (mode, &int_mode)
10600 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10601 {
10602 if (speed)
10603 *cost += extra_cost->alu.extend_arith;
10604
10605 op1 = aarch64_strip_extend (op1, true);
10606 *cost += rtx_cost (op1, VOIDmode,
10607 (enum rtx_code) GET_CODE (op1), 0, speed);
10608 return true;
10609 }
10610
10611 rtx new_op1 = aarch64_strip_extend (op1, false);
10612
10613 /* Cost this as an FMA-alike operation. */
10614 if ((GET_CODE (new_op1) == MULT
10615 || aarch64_shift_p (GET_CODE (new_op1)))
10616 && code != COMPARE)
10617 {
10618 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10619 (enum rtx_code) code,
10620 speed);
10621 return true;
10622 }
10623
10624 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10625
10626 if (speed)
10627 {
10628 if (VECTOR_MODE_P (mode))
10629 {
10630 /* Vector SUB. */
10631 *cost += extra_cost->vect.alu;
10632 }
10633 else if (GET_MODE_CLASS (mode) == MODE_INT)
10634 {
10635 /* SUB(S). */
10636 *cost += extra_cost->alu.arith;
10637 }
10638 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10639 {
10640 /* FSUB. */
10641 *cost += extra_cost->fp[mode == DFmode].addsub;
10642 }
10643 }
10644 return true;
10645 }
10646
10647 case PLUS:
10648 {
10649 rtx new_op0;
10650
10651 op0 = XEXP (x, 0);
10652 op1 = XEXP (x, 1);
10653
10654 cost_plus:
10655 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10656 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10657 {
10658 /* CSINC. */
10659 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10660 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10661 return true;
10662 }
10663
10664 if (GET_MODE_CLASS (mode) == MODE_INT
10665 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10666 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10667 {
10668 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10669
10670 if (speed)
10671 /* ADD (immediate). */
10672 *cost += extra_cost->alu.arith;
10673 return true;
10674 }
10675
10676 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10677
10678 /* Look for ADD (extended register). */
10679 if (is_a <scalar_int_mode> (mode, &int_mode)
10680 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10681 {
10682 if (speed)
10683 *cost += extra_cost->alu.extend_arith;
10684
10685 op0 = aarch64_strip_extend (op0, true);
10686 *cost += rtx_cost (op0, VOIDmode,
10687 (enum rtx_code) GET_CODE (op0), 0, speed);
10688 return true;
10689 }
10690
10691 /* Strip any extend, leave shifts behind as we will
10692 cost them through mult_cost. */
10693 new_op0 = aarch64_strip_extend (op0, false);
10694
10695 if (GET_CODE (new_op0) == MULT
10696 || aarch64_shift_p (GET_CODE (new_op0)))
10697 {
10698 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10699 speed);
10700 return true;
10701 }
10702
10703 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10704
10705 if (speed)
10706 {
10707 if (VECTOR_MODE_P (mode))
10708 {
10709 /* Vector ADD. */
10710 *cost += extra_cost->vect.alu;
10711 }
10712 else if (GET_MODE_CLASS (mode) == MODE_INT)
10713 {
10714 /* ADD. */
10715 *cost += extra_cost->alu.arith;
10716 }
10717 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10718 {
10719 /* FADD. */
10720 *cost += extra_cost->fp[mode == DFmode].addsub;
10721 }
10722 }
10723 return true;
10724 }
10725
10726 case BSWAP:
10727 *cost = COSTS_N_INSNS (1);
10728
10729 if (speed)
10730 {
10731 if (VECTOR_MODE_P (mode))
10732 *cost += extra_cost->vect.alu;
10733 else
10734 *cost += extra_cost->alu.rev;
10735 }
10736 return false;
10737
10738 case IOR:
10739 if (aarch_rev16_p (x))
10740 {
10741 *cost = COSTS_N_INSNS (1);
10742
10743 if (speed)
10744 {
10745 if (VECTOR_MODE_P (mode))
10746 *cost += extra_cost->vect.alu;
10747 else
10748 *cost += extra_cost->alu.rev;
10749 }
10750 return true;
10751 }
10752
10753 if (aarch64_extr_rtx_p (x, &op0, &op1))
10754 {
10755 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10756 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10757 if (speed)
10758 *cost += extra_cost->alu.shift;
10759
10760 return true;
10761 }
10762 /* Fall through. */
10763 case XOR:
10764 case AND:
10765 cost_logic:
10766 op0 = XEXP (x, 0);
10767 op1 = XEXP (x, 1);
10768
10769 if (VECTOR_MODE_P (mode))
10770 {
10771 if (speed)
10772 *cost += extra_cost->vect.alu;
10773 return true;
10774 }
10775
10776 if (code == AND
10777 && GET_CODE (op0) == MULT
10778 && CONST_INT_P (XEXP (op0, 1))
10779 && CONST_INT_P (op1)
10780 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10781 INTVAL (op1)) != 0)
10782 {
10783 /* This is a UBFM/SBFM. */
10784 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10785 if (speed)
10786 *cost += extra_cost->alu.bfx;
10787 return true;
10788 }
10789
10790 if (is_int_mode (mode, &int_mode))
10791 {
10792 if (CONST_INT_P (op1))
10793 {
10794 /* We have a mask + shift version of a UBFIZ
10795 i.e. the *andim_ashift<mode>_bfiz pattern. */
10796 if (GET_CODE (op0) == ASHIFT
10797 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10798 XEXP (op0, 1)))
10799 {
10800 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10801 (enum rtx_code) code, 0, speed);
10802 if (speed)
10803 *cost += extra_cost->alu.bfx;
10804
10805 return true;
10806 }
10807 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10808 {
10809 /* We possibly get the immediate for free; this is not
10810 modelled. */
10811 *cost += rtx_cost (op0, int_mode,
10812 (enum rtx_code) code, 0, speed);
10813 if (speed)
10814 *cost += extra_cost->alu.logical;
10815
10816 return true;
10817 }
10818 }
10819 else
10820 {
10821 rtx new_op0 = op0;
10822
10823 /* Handle ORN, EON, or BIC. */
10824 if (GET_CODE (op0) == NOT)
10825 op0 = XEXP (op0, 0);
10826
10827 new_op0 = aarch64_strip_shift (op0);
10828
10829 /* If we had a shift on op0 then this is a logical-shift-
10830 by-register/immediate operation. Otherwise, this is just
10831 a logical operation. */
10832 if (speed)
10833 {
10834 if (new_op0 != op0)
10835 {
10836 /* Shift by immediate. */
10837 if (CONST_INT_P (XEXP (op0, 1)))
10838 *cost += extra_cost->alu.log_shift;
10839 else
10840 *cost += extra_cost->alu.log_shift_reg;
10841 }
10842 else
10843 *cost += extra_cost->alu.logical;
10844 }
10845
10846 /* In both cases we want to cost both operands. */
10847 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10848 0, speed);
10849 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10850 1, speed);
10851
10852 return true;
10853 }
10854 }
10855 return false;
10856
10857 case NOT:
10858 x = XEXP (x, 0);
10859 op0 = aarch64_strip_shift (x);
10860
10861 if (VECTOR_MODE_P (mode))
10862 {
10863 /* Vector NOT. */
10864 *cost += extra_cost->vect.alu;
10865 return false;
10866 }
10867
10868 /* MVN-shifted-reg. */
10869 if (op0 != x)
10870 {
10871 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10872
10873 if (speed)
10874 *cost += extra_cost->alu.log_shift;
10875
10876 return true;
10877 }
10878 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10879 Handle the second form here taking care that 'a' in the above can
10880 be a shift. */
10881 else if (GET_CODE (op0) == XOR)
10882 {
10883 rtx newop0 = XEXP (op0, 0);
10884 rtx newop1 = XEXP (op0, 1);
10885 rtx op0_stripped = aarch64_strip_shift (newop0);
10886
10887 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10888 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10889
10890 if (speed)
10891 {
10892 if (op0_stripped != newop0)
10893 *cost += extra_cost->alu.log_shift;
10894 else
10895 *cost += extra_cost->alu.logical;
10896 }
10897
10898 return true;
10899 }
10900 /* MVN. */
10901 if (speed)
10902 *cost += extra_cost->alu.logical;
10903
10904 return false;
10905
10906 case ZERO_EXTEND:
10907
10908 op0 = XEXP (x, 0);
10909 /* If a value is written in SI mode, then zero extended to DI
10910 mode, the operation will in general be free as a write to
10911 a 'w' register implicitly zeroes the upper bits of an 'x'
10912 register. However, if this is
10913
10914 (set (reg) (zero_extend (reg)))
10915
10916 we must cost the explicit register move. */
10917 if (mode == DImode
10918 && GET_MODE (op0) == SImode
10919 && outer == SET)
10920 {
10921 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10922
10923 /* If OP_COST is non-zero, then the cost of the zero extend
10924 is effectively the cost of the inner operation. Otherwise
10925 we have a MOV instruction and we take the cost from the MOV
10926 itself. This is true independently of whether we are
10927 optimizing for space or time. */
10928 if (op_cost)
10929 *cost = op_cost;
10930
10931 return true;
10932 }
10933 else if (MEM_P (op0))
10934 {
10935 /* All loads can zero extend to any size for free. */
10936 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10937 return true;
10938 }
10939
10940 op0 = aarch64_extend_bitfield_pattern_p (x);
10941 if (op0)
10942 {
10943 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10944 if (speed)
10945 *cost += extra_cost->alu.bfx;
10946 return true;
10947 }
10948
10949 if (speed)
10950 {
10951 if (VECTOR_MODE_P (mode))
10952 {
10953 /* UMOV. */
10954 *cost += extra_cost->vect.alu;
10955 }
10956 else
10957 {
10958 /* We generate an AND instead of UXTB/UXTH. */
10959 *cost += extra_cost->alu.logical;
10960 }
10961 }
10962 return false;
10963
10964 case SIGN_EXTEND:
10965 if (MEM_P (XEXP (x, 0)))
10966 {
10967 /* LDRSH. */
10968 if (speed)
10969 {
10970 rtx address = XEXP (XEXP (x, 0), 0);
10971 *cost += extra_cost->ldst.load_sign_extend;
10972
10973 *cost +=
10974 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10975 0, speed));
10976 }
10977 return true;
10978 }
10979
10980 op0 = aarch64_extend_bitfield_pattern_p (x);
10981 if (op0)
10982 {
10983 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10984 if (speed)
10985 *cost += extra_cost->alu.bfx;
10986 return true;
10987 }
10988
10989 if (speed)
10990 {
10991 if (VECTOR_MODE_P (mode))
10992 *cost += extra_cost->vect.alu;
10993 else
10994 *cost += extra_cost->alu.extend;
10995 }
10996 return false;
10997
10998 case ASHIFT:
10999 op0 = XEXP (x, 0);
11000 op1 = XEXP (x, 1);
11001
11002 if (CONST_INT_P (op1))
11003 {
11004 if (speed)
11005 {
11006 if (VECTOR_MODE_P (mode))
11007 {
11008 /* Vector shift (immediate). */
11009 *cost += extra_cost->vect.alu;
11010 }
11011 else
11012 {
11013 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11014 aliases. */
11015 *cost += extra_cost->alu.shift;
11016 }
11017 }
11018
11019 /* We can incorporate zero/sign extend for free. */
11020 if (GET_CODE (op0) == ZERO_EXTEND
11021 || GET_CODE (op0) == SIGN_EXTEND)
11022 op0 = XEXP (op0, 0);
11023
11024 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11025 return true;
11026 }
11027 else
11028 {
11029 if (VECTOR_MODE_P (mode))
11030 {
11031 if (speed)
11032 /* Vector shift (register). */
11033 *cost += extra_cost->vect.alu;
11034 }
11035 else
11036 {
11037 if (speed)
11038 /* LSLV. */
11039 *cost += extra_cost->alu.shift_reg;
11040
11041 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11042 && CONST_INT_P (XEXP (op1, 1))
11043 && known_eq (INTVAL (XEXP (op1, 1)),
11044 GET_MODE_BITSIZE (mode) - 1))
11045 {
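/* A variable shift only consumes the low bits of the shift amount
   (modulo the register width), so masking the amount with
   GET_MODE_BITSIZE (mode) - 1 needs no extra instruction and the AND
   is treated as free here.  */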
11046 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11047 /* We already demanded XEXP (op1, 0) to be REG_P, so
11048 don't recurse into it. */
11049 return true;
11050 }
11051 }
11052 return false; /* All arguments need to be in registers. */
11053 }
11054
11055 case ROTATE:
11056 case ROTATERT:
11057 case LSHIFTRT:
11058 case ASHIFTRT:
11059 op0 = XEXP (x, 0);
11060 op1 = XEXP (x, 1);
11061
11062 if (CONST_INT_P (op1))
11063 {
11064 /* ASR (immediate) and friends. */
11065 if (speed)
11066 {
11067 if (VECTOR_MODE_P (mode))
11068 *cost += extra_cost->vect.alu;
11069 else
11070 *cost += extra_cost->alu.shift;
11071 }
11072
11073 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11074 return true;
11075 }
11076 else
11077 {
11078 if (VECTOR_MODE_P (mode))
11079 {
11080 if (speed)
11081 /* Vector shift (register). */
11082 *cost += extra_cost->vect.alu;
11083 }
11084 else
11085 {
11086 if (speed)
11087 /* ASR (register) and friends. */
11088 *cost += extra_cost->alu.shift_reg;
11089
11090 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11091 && CONST_INT_P (XEXP (op1, 1))
11092 && known_eq (INTVAL (XEXP (op1, 1)),
11093 GET_MODE_BITSIZE (mode) - 1))
11094 {
11095 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11096 /* We already demanded XEXP (op1, 0) to be REG_P, so
11097 don't recurse into it. */
11098 return true;
11099 }
11100 }
11101 return false; /* All arguments need to be in registers. */
11102 }
11103
11104 case SYMBOL_REF:
11105
11106 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11107 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11108 {
11109 /* LDR. */
11110 if (speed)
11111 *cost += extra_cost->ldst.load;
11112 }
11113 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11114 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11115 {
11116 /* ADRP, followed by ADD. */
11117 *cost += COSTS_N_INSNS (1);
11118 if (speed)
11119 *cost += 2 * extra_cost->alu.arith;
11120 }
11121 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11122 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11123 {
11124 /* ADR. */
11125 if (speed)
11126 *cost += extra_cost->alu.arith;
11127 }
11128
11129 if (flag_pic)
11130 {
11131 /* One extra load instruction, after accessing the GOT. */
11132 *cost += COSTS_N_INSNS (1);
11133 if (speed)
11134 *cost += extra_cost->ldst.load;
11135 }
11136 return true;
11137
11138 case HIGH:
11139 case LO_SUM:
11140 /* ADRP/ADD (immediate). */
11141 if (speed)
11142 *cost += extra_cost->alu.arith;
11143 return true;
11144
11145 case ZERO_EXTRACT:
11146 case SIGN_EXTRACT:
11147 /* UBFX/SBFX. */
11148 if (speed)
11149 {
11150 if (VECTOR_MODE_P (mode))
11151 *cost += extra_cost->vect.alu;
11152 else
11153 *cost += extra_cost->alu.bfx;
11154 }
11155
11156 /* We can trust that the immediates used will be correct (there
11157 are no by-register forms), so we need only cost op0. */
11158 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11159 return true;
11160
11161 case MULT:
11162 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11163 /* aarch64_rtx_mult_cost always handles recursion to its
11164 operands. */
11165 return true;
11166
11167 case MOD:
11168 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11169 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same
11170 as that of an unconditional negate. This case should only ever be reached through
11171 the set_smod_pow2_cheap check in expmed.c. */
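/* Illustratively, SImode x % 4 expands to roughly:
     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi
   (register allocation may differ), hence the 4-instruction baseline
   below.  */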
11172 if (CONST_INT_P (XEXP (x, 1))
11173 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11174 && (mode == SImode || mode == DImode))
11175 {
11176 /* We expand to 4 instructions. Reset the baseline. */
11177 *cost = COSTS_N_INSNS (4);
11178
11179 if (speed)
11180 *cost += 2 * extra_cost->alu.logical
11181 + 2 * extra_cost->alu.arith;
11182
11183 return true;
11184 }
11185
11186 /* Fall-through. */
11187 case UMOD:
11188 if (speed)
11189 {
11190 /* Slightly prefer UMOD over SMOD. */
11191 if (VECTOR_MODE_P (mode))
11192 *cost += extra_cost->vect.alu;
11193 else if (GET_MODE_CLASS (mode) == MODE_INT)
11194 *cost += (extra_cost->mult[mode == DImode].add
11195 + extra_cost->mult[mode == DImode].idiv
11196 + (code == MOD ? 1 : 0));
11197 }
11198 return false; /* All arguments need to be in registers. */
11199
11200 case DIV:
11201 case UDIV:
11202 case SQRT:
11203 if (speed)
11204 {
11205 if (VECTOR_MODE_P (mode))
11206 *cost += extra_cost->vect.alu;
11207 else if (GET_MODE_CLASS (mode) == MODE_INT)
11208 /* There is no integer SQRT, so only DIV and UDIV can get
11209 here. */
11210 *cost += (extra_cost->mult[mode == DImode].idiv
11211 /* Slightly prefer UDIV over SDIV. */
11212 + (code == DIV ? 1 : 0));
11213 else
11214 *cost += extra_cost->fp[mode == DFmode].div;
11215 }
11216 return false; /* All arguments need to be in registers. */
11217
11218 case IF_THEN_ELSE:
11219 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11220 XEXP (x, 2), cost, speed);
11221
11222 case EQ:
11223 case NE:
11224 case GT:
11225 case GTU:
11226 case LT:
11227 case LTU:
11228 case GE:
11229 case GEU:
11230 case LE:
11231 case LEU:
11232
11233 return false; /* All arguments must be in registers. */
11234
11235 case FMA:
11236 op0 = XEXP (x, 0);
11237 op1 = XEXP (x, 1);
11238 op2 = XEXP (x, 2);
11239
11240 if (speed)
11241 {
11242 if (VECTOR_MODE_P (mode))
11243 *cost += extra_cost->vect.alu;
11244 else
11245 *cost += extra_cost->fp[mode == DFmode].fma;
11246 }
11247
11248 /* FMSUB, FNMADD, and FNMSUB are free. */
11249 if (GET_CODE (op0) == NEG)
11250 op0 = XEXP (op0, 0);
11251
11252 if (GET_CODE (op2) == NEG)
11253 op2 = XEXP (op2, 0);
11254
11255 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11256 and the by-element operand as operand 0. */
11257 if (GET_CODE (op1) == NEG)
11258 op1 = XEXP (op1, 0);
11259
11260 /* Catch vector-by-element operations. The by-element operand can
11261 either be (vec_duplicate (vec_select (x))) or just
11262 (vec_select (x)), depending on whether we are multiplying by
11263 a vector or a scalar.
11264
11265 Canonicalization is not very good in these cases: FMA4 will put the
11266 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11267 if (GET_CODE (op0) == VEC_DUPLICATE)
11268 op0 = XEXP (op0, 0);
11269 else if (GET_CODE (op1) == VEC_DUPLICATE)
11270 op1 = XEXP (op1, 0);
11271
11272 if (GET_CODE (op0) == VEC_SELECT)
11273 op0 = XEXP (op0, 0);
11274 else if (GET_CODE (op1) == VEC_SELECT)
11275 op1 = XEXP (op1, 0);
11276
11277 /* If the remaining parameters are not registers,
11278 get the cost to put them into registers. */
11279 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11280 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11281 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11282 return true;
11283
11284 case FLOAT:
11285 case UNSIGNED_FLOAT:
11286 if (speed)
11287 *cost += extra_cost->fp[mode == DFmode].fromint;
11288 return false;
11289
11290 case FLOAT_EXTEND:
11291 if (speed)
11292 {
11293 if (VECTOR_MODE_P (mode))
11294 {
11295 /* Vector widening conversion. */
11296 *cost += extra_cost->vect.alu;
11297 }
11298 else
11299 *cost += extra_cost->fp[mode == DFmode].widen;
11300 }
11301 return false;
11302
11303 case FLOAT_TRUNCATE:
11304 if (speed)
11305 {
11306 if (VECTOR_MODE_P (mode))
11307 {
11308 /* Vector conversion. */
11309 *cost += extra_cost->vect.alu;
11310 }
11311 else
11312 *cost += extra_cost->fp[mode == DFmode].narrow;
11313 }
11314 return false;
11315
11316 case FIX:
11317 case UNSIGNED_FIX:
11318 x = XEXP (x, 0);
11319 /* Strip the rounding part. They will all be implemented
11320 by the fcvt* family of instructions anyway. */
11321 if (GET_CODE (x) == UNSPEC)
11322 {
11323 unsigned int uns_code = XINT (x, 1);
11324
11325 if (uns_code == UNSPEC_FRINTA
11326 || uns_code == UNSPEC_FRINTM
11327 || uns_code == UNSPEC_FRINTN
11328 || uns_code == UNSPEC_FRINTP
11329 || uns_code == UNSPEC_FRINTZ)
11330 x = XVECEXP (x, 0, 0);
11331 }
11332
11333 if (speed)
11334 {
11335 if (VECTOR_MODE_P (mode))
11336 *cost += extra_cost->vect.alu;
11337 else
11338 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11339 }
11340
11341 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11342 fixed-point fcvt. */
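/* For example, (fix (mult x 16.0)) can be implemented as a single
   FCVTZS with 4 fractional bits instead of an FMUL followed by a
   convert (illustrative; the exact instruction depends on the mode).  */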
11343 if (GET_CODE (x) == MULT
11344 && ((VECTOR_MODE_P (mode)
11345 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11346 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11347 {
11348 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11349 0, speed);
11350 return true;
11351 }
11352
11353 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11354 return true;
11355
11356 case ABS:
11357 if (VECTOR_MODE_P (mode))
11358 {
11359 /* ABS (vector). */
11360 if (speed)
11361 *cost += extra_cost->vect.alu;
11362 }
11363 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11364 {
11365 op0 = XEXP (x, 0);
11366
11367 /* FABD, which is analogous to FADD. */
11368 if (GET_CODE (op0) == MINUS)
11369 {
11370 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11371 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11372 if (speed)
11373 *cost += extra_cost->fp[mode == DFmode].addsub;
11374
11375 return true;
11376 }
11377 /* Simple FABS is analogous to FNEG. */
11378 if (speed)
11379 *cost += extra_cost->fp[mode == DFmode].neg;
11380 }
11381 else
11382 {
11383 /* Integer ABS will either be split to
11384 two arithmetic instructions, or will be an ABS
11385 (scalar), which we don't model. */
11386 *cost = COSTS_N_INSNS (2);
11387 if (speed)
11388 *cost += 2 * extra_cost->alu.arith;
11389 }
11390 return false;
11391
11392 case SMAX:
11393 case SMIN:
11394 if (speed)
11395 {
11396 if (VECTOR_MODE_P (mode))
11397 *cost += extra_cost->vect.alu;
11398 else
11399 {
11400 /* FMAXNM/FMINNM/FMAX/FMIN.
11401 TODO: This may not be accurate for all implementations, but
11402 we do not model this in the cost tables. */
11403 *cost += extra_cost->fp[mode == DFmode].addsub;
11404 }
11405 }
11406 return false;
11407
11408 case UNSPEC:
11409 /* The floating point round to integer frint* instructions. */
11410 if (aarch64_frint_unspec_p (XINT (x, 1)))
11411 {
11412 if (speed)
11413 *cost += extra_cost->fp[mode == DFmode].roundint;
11414
11415 return false;
11416 }
11417
11418 if (XINT (x, 1) == UNSPEC_RBIT)
11419 {
11420 if (speed)
11421 *cost += extra_cost->alu.rev;
11422
11423 return false;
11424 }
11425 break;
11426
11427 case TRUNCATE:
11428
11429 /* Decompose <su>muldi3_highpart. */
11430 if (/* (truncate:DI */
11431 mode == DImode
11432 /* (lshiftrt:TI */
11433 && GET_MODE (XEXP (x, 0)) == TImode
11434 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11435 /* (mult:TI */
11436 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11437 /* (ANY_EXTEND:TI (reg:DI))
11438 (ANY_EXTEND:TI (reg:DI))) */
11439 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11440 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11441 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11442 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11443 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11444 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11445 /* (const_int 64) */
11446 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11447 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11448 {
11449 /* UMULH/SMULH. */
11450 if (speed)
11451 *cost += extra_cost->mult[mode == DImode].extend;
11452 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11453 mode, MULT, 0, speed);
11454 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11455 mode, MULT, 1, speed);
11456 return true;
11457 }
11458
11459 /* Fall through. */
11460 default:
11461 break;
11462 }
11463
11464 if (dump_file
11465 && flag_aarch64_verbose_cost)
11466 fprintf (dump_file,
11467 "\nFailed to cost RTX. Assuming default cost.\n");
11468
11469 return true;
11470 }
11471
11472 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
11473 calculated for X. This cost is stored in *COST. Returns true
11474 if the total cost of X was calculated. */
11475 static bool
11476 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11477 int param, int *cost, bool speed)
11478 {
11479 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11480
11481 if (dump_file
11482 && flag_aarch64_verbose_cost)
11483 {
11484 print_rtl_single (dump_file, x);
11485 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11486 speed ? "Hot" : "Cold",
11487 *cost, result ? "final" : "partial");
11488 }
11489
11490 return result;
11491 }
11492
11493 static int
11494 aarch64_register_move_cost (machine_mode mode,
11495 reg_class_t from_i, reg_class_t to_i)
11496 {
11497 enum reg_class from = (enum reg_class) from_i;
11498 enum reg_class to = (enum reg_class) to_i;
11499 const struct cpu_regmove_cost *regmove_cost
11500 = aarch64_tune_params.regmove_cost;
11501
11502 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11503 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11504 to = GENERAL_REGS;
11505
11506 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11507 from = GENERAL_REGS;
11508
11509 /* The cost of moving between GPRs and the stack is the same as GP2GP. */
11510 if ((from == GENERAL_REGS && to == STACK_REG)
11511 || (to == GENERAL_REGS && from == STACK_REG))
11512 return regmove_cost->GP2GP;
11513
11514 /* To/From the stack register, we move via the gprs. */
11515 if (to == STACK_REG || from == STACK_REG)
11516 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11517 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11518
11519 if (known_eq (GET_MODE_SIZE (mode), 16))
11520 {
11521 /* 128-bit operations on general registers require 2 instructions. */
11522 if (from == GENERAL_REGS && to == GENERAL_REGS)
11523 return regmove_cost->GP2GP * 2;
11524 else if (from == GENERAL_REGS)
11525 return regmove_cost->GP2FP * 2;
11526 else if (to == GENERAL_REGS)
11527 return regmove_cost->FP2GP * 2;
11528
11529 /* When AdvSIMD instructions are disabled it is not possible to move
11530 a 128-bit value directly between Q registers. This is handled in
11531 secondary reload. A general register is used as a scratch to move
11532 the upper DI value and the lower DI value is moved directly,
11533 hence the cost is the sum of three moves. */
11534 if (! TARGET_SIMD)
11535 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11536
11537 return regmove_cost->FP2FP;
11538 }
11539
11540 if (from == GENERAL_REGS && to == GENERAL_REGS)
11541 return regmove_cost->GP2GP;
11542 else if (from == GENERAL_REGS)
11543 return regmove_cost->GP2FP;
11544 else if (to == GENERAL_REGS)
11545 return regmove_cost->FP2GP;
11546
11547 return regmove_cost->FP2FP;
11548 }
11549
11550 static int
11551 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11552 reg_class_t rclass ATTRIBUTE_UNUSED,
11553 bool in ATTRIBUTE_UNUSED)
11554 {
11555 return aarch64_tune_params.memmov_cost;
11556 }
11557
11558 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11559 to optimize 1.0/sqrt. */
11560
11561 static bool
11562 use_rsqrt_p (machine_mode mode)
11563 {
11564 return (!flag_trapping_math
11565 && flag_unsafe_math_optimizations
11566 && ((aarch64_tune_params.approx_modes->recip_sqrt
11567 & AARCH64_APPROX_MODE (mode))
11568 || flag_mrecip_low_precision_sqrt));
11569 }
11570
11571 /* Function to decide when to use the approximate reciprocal square root
11572 builtin. */
11573
11574 static tree
11575 aarch64_builtin_reciprocal (tree fndecl)
11576 {
11577 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11578
11579 if (!use_rsqrt_p (mode))
11580 return NULL_TREE;
11581 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11582 }
11583
11584 /* Emit instruction sequence to compute either the approximate square root
11585 or its approximate reciprocal, depending on the flag RECP, and return
11586 whether the sequence was emitted or not. */
11587
11588 bool
11589 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11590 {
11591 machine_mode mode = GET_MODE (dst);
11592
11593 if (GET_MODE_INNER (mode) == HFmode)
11594 {
11595 gcc_assert (!recp);
11596 return false;
11597 }
11598
11599 if (!recp)
11600 {
11601 if (!(flag_mlow_precision_sqrt
11602 || (aarch64_tune_params.approx_modes->sqrt
11603 & AARCH64_APPROX_MODE (mode))))
11604 return false;
11605
11606 if (flag_finite_math_only
11607 || flag_trapping_math
11608 || !flag_unsafe_math_optimizations
11609 || optimize_function_for_size_p (cfun))
11610 return false;
11611 }
11612 else
11613 /* Caller assumes we cannot fail. */
11614 gcc_assert (use_rsqrt_p (mode));
11615
11616 machine_mode mmsk = mode_for_int_vector (mode).require ();
11617 rtx xmsk = gen_reg_rtx (mmsk);
11618 if (!recp)
11619 /* When calculating the approximate square root, compare the
11620 argument with 0.0 and create a mask. */
11621 emit_insn (gen_rtx_SET (xmsk,
11622 gen_rtx_NEG (mmsk,
11623 gen_rtx_EQ (mmsk, src,
11624 CONST0_RTX (mode)))));
11625
11626 /* Estimate the approximate reciprocal square root. */
11627 rtx xdst = gen_reg_rtx (mode);
11628 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11629
11630 /* Iterate over the series twice for SF and thrice for DF. */
11631 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11632
11633 /* Optionally iterate over the series once less for faster performance
11634 while sacrificing the accuracy. */
11635 if ((recp && flag_mrecip_low_precision_sqrt)
11636 || (!recp && flag_mlow_precision_sqrt))
11637 iterations--;
11638
11639 /* Iterate over the series to calculate the approximate reciprocal square
11640 root. */
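/* Each pass below is a Newton-Raphson step for 1/sqrt (src):
   FRSQRTS (src, x * x) computes (3 - src * x * x) / 2, so multiplying
   the current estimate by it gives x' = x * (3 - src * x^2) / 2.  */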
11641 rtx x1 = gen_reg_rtx (mode);
11642 while (iterations--)
11643 {
11644 rtx x2 = gen_reg_rtx (mode);
11645 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11646
11647 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11648
11649 if (iterations > 0)
11650 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11651 }
11652
11653 if (!recp)
11654 {
11655 /* Qualify the approximate reciprocal square root when the argument is
11656 0.0 by squashing the intermediary result to 0.0. */
11657 rtx xtmp = gen_reg_rtx (mmsk);
11658 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11659 gen_rtx_SUBREG (mmsk, xdst, 0)));
11660 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11661
11662 /* Calculate the approximate square root. */
11663 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11664 }
11665
11666 /* Finalize the approximation. */
11667 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11668
11669 return true;
11670 }
11671
11672 /* Emit the instruction sequence to compute the approximation for the division
11673 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11674
11675 bool
11676 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11677 {
11678 machine_mode mode = GET_MODE (quo);
11679
11680 if (GET_MODE_INNER (mode) == HFmode)
11681 return false;
11682
11683 bool use_approx_division_p = (flag_mlow_precision_div
11684 || (aarch64_tune_params.approx_modes->division
11685 & AARCH64_APPROX_MODE (mode)));
11686
11687 if (!flag_finite_math_only
11688 || flag_trapping_math
11689 || !flag_unsafe_math_optimizations
11690 || optimize_function_for_size_p (cfun)
11691 || !use_approx_division_p)
11692 return false;
11693
11694 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11695 return false;
11696
11697 /* Estimate the approximate reciprocal. */
11698 rtx xrcp = gen_reg_rtx (mode);
11699 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11700
11701 /* Iterate over the series twice for SF and thrice for DF. */
11702 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11703
11704 /* Optionally iterate over the series once less for faster performance,
11705 while sacrificing the accuracy. */
11706 if (flag_mlow_precision_div)
11707 iterations--;
11708
11709 /* Iterate over the series to calculate the approximate reciprocal. */
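/* Each pass below is a Newton-Raphson step for 1/den:
   FRECPS (x, den) computes 2 - x * den, so the refined estimate is
   x' = x * (2 - x * den).  */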
11710 rtx xtmp = gen_reg_rtx (mode);
11711 while (iterations--)
11712 {
11713 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11714
11715 if (iterations > 0)
11716 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11717 }
11718
11719 if (num != CONST1_RTX (mode))
11720 {
11721 /* As the approximate reciprocal of DEN is already calculated, only
11722 calculate the approximate division when NUM is not 1.0. */
11723 rtx xnum = force_reg (mode, num);
11724 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11725 }
11726
11727 /* Finalize the approximation. */
11728 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11729 return true;
11730 }
11731
11732 /* Return the number of instructions that can be issued per cycle. */
11733 static int
11734 aarch64_sched_issue_rate (void)
11735 {
11736 return aarch64_tune_params.issue_rate;
11737 }
11738
11739 static int
11740 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11741 {
11742 int issue_rate = aarch64_sched_issue_rate ();
11743
11744 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11745 }
11746
11747
11748 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11749 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11750 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11751
11752 static int
11753 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11754 int ready_index)
11755 {
11756 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11757 }
11758
11759
11760 /* Vectorizer cost model target hooks. */
11761
11762 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11763 static int
11764 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11765 tree vectype,
11766 int misalign ATTRIBUTE_UNUSED)
11767 {
11768 unsigned elements;
11769 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11770 bool fp = false;
11771
11772 if (vectype != NULL)
11773 fp = FLOAT_TYPE_P (vectype);
11774
11775 switch (type_of_cost)
11776 {
11777 case scalar_stmt:
11778 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11779
11780 case scalar_load:
11781 return costs->scalar_load_cost;
11782
11783 case scalar_store:
11784 return costs->scalar_store_cost;
11785
11786 case vector_stmt:
11787 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11788
11789 case vector_load:
11790 return costs->vec_align_load_cost;
11791
11792 case vector_store:
11793 return costs->vec_store_cost;
11794
11795 case vec_to_scalar:
11796 return costs->vec_to_scalar_cost;
11797
11798 case scalar_to_vec:
11799 return costs->scalar_to_vec_cost;
11800
11801 case unaligned_load:
11802 case vector_gather_load:
11803 return costs->vec_unalign_load_cost;
11804
11805 case unaligned_store:
11806 case vector_scatter_store:
11807 return costs->vec_unalign_store_cost;
11808
11809 case cond_branch_taken:
11810 return costs->cond_taken_branch_cost;
11811
11812 case cond_branch_not_taken:
11813 return costs->cond_not_taken_branch_cost;
11814
11815 case vec_perm:
11816 return costs->vec_permute_cost;
11817
11818 case vec_promote_demote:
11819 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11820
11821 case vec_construct:
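/* Building a vector from scalars is costed below as roughly one
   statement per two elements plus one; this heuristic is not taken
   from the per-core cost tables.  */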
11822 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11823 return elements / 2 + 1;
11824
11825 default:
11826 gcc_unreachable ();
11827 }
11828 }
11829
11830 /* Implement targetm.vectorize.add_stmt_cost. */
11831 static unsigned
11832 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11833 struct _stmt_vec_info *stmt_info, int misalign,
11834 enum vect_cost_model_location where)
11835 {
11836 unsigned *cost = (unsigned *) data;
11837 unsigned retval = 0;
11838
11839 if (flag_vect_cost_model)
11840 {
11841 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11842 int stmt_cost =
11843 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11844
11845 /* Statements in an inner loop relative to the loop being
11846 vectorized are weighted more heavily. The value here is
11847 arbitrary and could potentially be improved with analysis. */
11848 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11849 count *= 50; /* FIXME */
11850
11851 retval = (unsigned) (count * stmt_cost);
11852 cost[where] += retval;
11853 }
11854
11855 return retval;
11856 }
11857
11858 static void initialize_aarch64_code_model (struct gcc_options *);
11859
11860 /* Parse the TO_PARSE string and put the architecture struct that it
11861 selects into RES and the architectural features into ISA_FLAGS.
11862 Return an aarch64_parse_opt_result describing the parse result.
11863 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11864 When the TO_PARSE string contains an invalid extension,
11865 a copy of the string is created and stored to INVALID_EXTENSION. */
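/* For example (illustrative), an -march value such as "armv8.2-a+fp16"
   selects the "armv8.2-a" entry in all_architectures and then enables
   the "fp16" extension in the returned ISA flags.  */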
11866
11867 static enum aarch64_parse_opt_result
11868 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11869 uint64_t *isa_flags, std::string *invalid_extension)
11870 {
11871 const char *ext;
11872 const struct processor *arch;
11873 size_t len;
11874
11875 ext = strchr (to_parse, '+');
11876
11877 if (ext != NULL)
11878 len = ext - to_parse;
11879 else
11880 len = strlen (to_parse);
11881
11882 if (len == 0)
11883 return AARCH64_PARSE_MISSING_ARG;
11884
11885
11886 /* Loop through the list of supported ARCHes to find a match. */
11887 for (arch = all_architectures; arch->name != NULL; arch++)
11888 {
11889 if (strlen (arch->name) == len
11890 && strncmp (arch->name, to_parse, len) == 0)
11891 {
11892 uint64_t isa_temp = arch->flags;
11893
11894 if (ext != NULL)
11895 {
11896 /* TO_PARSE string contains at least one extension. */
11897 enum aarch64_parse_opt_result ext_res
11898 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11899
11900 if (ext_res != AARCH64_PARSE_OK)
11901 return ext_res;
11902 }
11903 /* Extension parsing was successful. Confirm the result
11904 arch and ISA flags. */
11905 *res = arch;
11906 *isa_flags = isa_temp;
11907 return AARCH64_PARSE_OK;
11908 }
11909 }
11910
11911 /* ARCH name not found in list. */
11912 return AARCH64_PARSE_INVALID_ARG;
11913 }
11914
11915 /* Parse the TO_PARSE string and put the CPU struct that it selects into
11916 RES and the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result
11917 describing the parse result. If there is an error parsing, RES and
11918 ISA_FLAGS are left unchanged.
11919 When the TO_PARSE string contains an invalid extension,
11920 a copy of the string is created and stored to INVALID_EXTENSION. */
11921
11922 static enum aarch64_parse_opt_result
11923 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11924 uint64_t *isa_flags, std::string *invalid_extension)
11925 {
11926 const char *ext;
11927 const struct processor *cpu;
11928 size_t len;
11929
11930 ext = strchr (to_parse, '+');
11931
11932 if (ext != NULL)
11933 len = ext - to_parse;
11934 else
11935 len = strlen (to_parse);
11936
11937 if (len == 0)
11938 return AARCH64_PARSE_MISSING_ARG;
11939
11940
11941 /* Loop through the list of supported CPUs to find a match. */
11942 for (cpu = all_cores; cpu->name != NULL; cpu++)
11943 {
11944 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11945 {
11946 uint64_t isa_temp = cpu->flags;
11947
11948
11949 if (ext != NULL)
11950 {
11951 /* TO_PARSE string contains at least one extension. */
11952 enum aarch64_parse_opt_result ext_res
11953 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11954
11955 if (ext_res != AARCH64_PARSE_OK)
11956 return ext_res;
11957 }
11958 /* Extension parsing was successful. Confirm the result
11959 cpu and ISA flags. */
11960 *res = cpu;
11961 *isa_flags = isa_temp;
11962 return AARCH64_PARSE_OK;
11963 }
11964 }
11965
11966 /* CPU name not found in list. */
11967 return AARCH64_PARSE_INVALID_ARG;
11968 }
11969
11970 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11971 Return an aarch64_parse_opt_result describing the parse result.
11972 If the parsing fails, RES is left unchanged. */
11973
11974 static enum aarch64_parse_opt_result
11975 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11976 {
11977 const struct processor *cpu;
11978
11979 /* Loop through the list of supported CPUs to find a match. */
11980 for (cpu = all_cores; cpu->name != NULL; cpu++)
11981 {
11982 if (strcmp (cpu->name, to_parse) == 0)
11983 {
11984 *res = cpu;
11985 return AARCH64_PARSE_OK;
11986 }
11987 }
11988
11989 /* CPU name not found in list. */
11990 return AARCH64_PARSE_INVALID_ARG;
11991 }
11992
11993 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11994 described in FLAG. If it is, return the index bit for that fusion type.
11995 If not, report an error (printing OPTION_NAME) and return zero. */
11996
11997 static unsigned int
11998 aarch64_parse_one_option_token (const char *token,
11999 size_t length,
12000 const struct aarch64_flag_desc *flag,
12001 const char *option_name)
12002 {
12003 for (; flag->name != NULL; flag++)
12004 {
12005 if (length == strlen (flag->name)
12006 && !strncmp (flag->name, token, length))
12007 return flag->flag;
12008 }
12009
12010 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12011 return 0;
12012 }
12013
12014 /* Parse OPTION, which is a '.'-separated list of flags to enable.
12015 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12016 default state we inherit from the CPU tuning structures. OPTION_NAME
12017 gives the top-level option we are parsing in the -moverride string,
12018 for use in error messages. */
12019
12020 static unsigned int
12021 aarch64_parse_boolean_options (const char *option,
12022 const struct aarch64_flag_desc *flags,
12023 unsigned int initial_state,
12024 const char *option_name)
12025 {
12026 const char separator = '.';
12027 const char* specs = option;
12028 const char* ntoken = option;
12029 unsigned int found_flags = initial_state;
12030
12031 while ((ntoken = strchr (specs, separator)))
12032 {
12033 size_t token_length = ntoken - specs;
12034 unsigned token_ops = aarch64_parse_one_option_token (specs,
12035 token_length,
12036 flags,
12037 option_name);
12038 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12039 in the token stream, reset the supported operations. So:
12040
12041 adrp+add.cmp+branch.none.adrp+add
12042
12043 would have the result of turning on only adrp+add fusion. */
12044 if (!token_ops)
12045 found_flags = 0;
12046
12047 found_flags |= token_ops;
12048 specs = ++ntoken;
12049 }
12050
12051 /* We ended with a trailing separator; the string is ill-formed. */
12052 if (!(*specs))
12053 {
12054 error ("%s string ill-formed\n", option_name);
12055 return 0;
12056 }
12057
12058 /* We still have one more token to parse. */
12059 size_t token_length = strlen (specs);
12060 unsigned token_ops = aarch64_parse_one_option_token (specs,
12061 token_length,
12062 flags,
12063 option_name);
12064 if (!token_ops)
12065 found_flags = 0;
12066
12067 found_flags |= token_ops;
12068 return found_flags;
12069 }
12070
12071 /* Support for overriding instruction fusion. */
12072
12073 static void
12074 aarch64_parse_fuse_string (const char *fuse_string,
12075 struct tune_params *tune)
12076 {
12077 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12078 aarch64_fusible_pairs,
12079 tune->fusible_ops,
12080 "fuse=");
12081 }
12082
12083 /* Support for overriding other tuning flags. */
12084
12085 static void
12086 aarch64_parse_tune_string (const char *tune_string,
12087 struct tune_params *tune)
12088 {
12089 tune->extra_tuning_flags
12090 = aarch64_parse_boolean_options (tune_string,
12091 aarch64_tuning_flags,
12092 tune->extra_tuning_flags,
12093 "tune=");
12094 }
12095
12096 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12097 Accept the valid SVE vector widths allowed by
12098 aarch64_sve_vector_bits_enum and use it to override sve_width
12099 in TUNE. */
12100
12101 static void
12102 aarch64_parse_sve_width_string (const char *tune_string,
12103 struct tune_params *tune)
12104 {
12105 int width = -1;
12106
12107 int n = sscanf (tune_string, "%d", &width);
12108 if (n == EOF)
12109 {
12110 error ("invalid format for sve_width");
12111 return;
12112 }
12113 switch (width)
12114 {
12115 case SVE_128:
12116 case SVE_256:
12117 case SVE_512:
12118 case SVE_1024:
12119 case SVE_2048:
12120 break;
12121 default:
12122 error ("invalid sve_width value: %d", width);
12123 }
12124 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12125 }
12126
12127 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12128 we understand. If it is, extract the option string and hand it off to
12129 the appropriate function. */
12130
12131 void
12132 aarch64_parse_one_override_token (const char* token,
12133 size_t length,
12134 struct tune_params *tune)
12135 {
12136 const struct aarch64_tuning_override_function *fn
12137 = aarch64_tuning_override_functions;
12138
12139 const char *option_part = strchr (token, '=');
12140 if (!option_part)
12141 {
12142 error ("tuning string missing in option (%s)", token);
12143 return;
12144 }
12145
12146 /* Get the length of the option name. */
12147 length = option_part - token;
12148 /* Skip the '=' to get to the option string. */
12149 option_part++;
12150
12151 for (; fn->name != NULL; fn++)
12152 {
12153 if (!strncmp (fn->name, token, length))
12154 {
12155 fn->parse_override (option_part, tune);
12156 return;
12157 }
12158 }
12159
12160 error ("unknown tuning option (%s)",token);
12161 return;
12162 }
12163
12164 /* Validate and clamp aarch64_tls_size (the number of bits used for TLS offsets) based on the code model in OPTS. */
12165
12166 static void
12167 initialize_aarch64_tls_size (struct gcc_options *opts)
12168 {
12169 if (aarch64_tls_size == 0)
12170 aarch64_tls_size = 24;
12171
12172 switch (opts->x_aarch64_cmodel_var)
12173 {
12174 case AARCH64_CMODEL_TINY:
12175 /* Both the default and the maximum TLS size allowed under tiny are 1M,
12176 which needs two instructions to address, so we clamp the size to 24. */
12177 if (aarch64_tls_size > 24)
12178 aarch64_tls_size = 24;
12179 break;
12180 case AARCH64_CMODEL_SMALL:
12181 /* The maximum TLS size allowed under small is 4G. */
12182 if (aarch64_tls_size > 32)
12183 aarch64_tls_size = 32;
12184 break;
12185 case AARCH64_CMODEL_LARGE:
12186 /* The maximum TLS size allowed under large is 16E.
12187 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
12188 if (aarch64_tls_size > 48)
12189 aarch64_tls_size = 48;
12190 break;
12191 default:
12192 gcc_unreachable ();
12193 }
12194
12195 return;
12196 }
12197
12198 /* Parse STRING looking for options in the format:
12199 string :: option:string
12200 option :: name=substring
12201 name :: {a-z}
12202 substring :: defined by option. */
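/* For example (illustrative), the -moverride value
   "fuse=adrp+add.cmp+branch:sve_width=256" contains two options separated
   by ':', dispatched to the "fuse=" and "sve_width=" handlers by
   aarch64_parse_one_override_token.  */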
12203
12204 static void
12205 aarch64_parse_override_string (const char* input_string,
12206 struct tune_params* tune)
12207 {
12208 const char separator = ':';
12209 size_t string_length = strlen (input_string) + 1;
12210 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12211 char *string = string_root;
12212 strncpy (string, input_string, string_length);
12213 string[string_length - 1] = '\0';
12214
12215 char* ntoken = string;
12216
12217 while ((ntoken = strchr (string, separator)))
12218 {
12219 size_t token_length = ntoken - string;
12220 /* Make this substring look like a string. */
12221 *ntoken = '\0';
12222 aarch64_parse_one_override_token (string, token_length, tune);
12223 string = ++ntoken;
12224 }
12225
12226 /* One last option to parse. */
12227 aarch64_parse_one_override_token (string, strlen (string), tune);
12228 free (string_root);
12229 }
12230
12231
12232 static void
12233 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12234 {
12235 if (accepted_branch_protection_string)
12236 {
12237 opts->x_aarch64_branch_protection_string
12238 = xstrdup (accepted_branch_protection_string);
12239 }
12240
12241 /* PR 70044: We have to be careful about being called multiple times for the
12242 same function. This means all changes should be repeatable. */
12243
12244 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12245 Disable the frame pointer flag so the mid-end will not use a frame
12246 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12247 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12248 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12249 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12250 if (opts->x_flag_omit_frame_pointer == 0)
12251 opts->x_flag_omit_frame_pointer = 2;
12252
12253 /* If not optimizing for size, set the default
12254 alignment to what the target wants. */
12255 if (!opts->x_optimize_size)
12256 {
12257 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12258 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12259 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12260 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12261 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12262 opts->x_str_align_functions = aarch64_tune_params.function_align;
12263 }
12264
12265 /* We default to no pc-relative literal loads. */
12266
12267 aarch64_pcrelative_literal_loads = false;
12268
12269 /* If -mpc-relative-literal-loads is set on the command line, this
12270 implies that the user asked for PC relative literal loads. */
12271 if (opts->x_pcrelative_literal_loads == 1)
12272 aarch64_pcrelative_literal_loads = true;
12273
12274 /* In the tiny memory model it makes no sense to disallow PC relative
12275 literal pool loads. */
12276 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12277 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12278 aarch64_pcrelative_literal_loads = true;
12279
12280 /* When enabling the lower precision Newton series for the square root, also
12281 enable it for the reciprocal square root, since the latter is an
12282 intermediary step for the former. */
12283 if (flag_mlow_precision_sqrt)
12284 flag_mrecip_low_precision_sqrt = true;
12285 }
12286
12287 /* 'Unpack' the internal tuning structs and update the options
12288 in OPTS. The caller must have set up selected_tune and selected_arch
12289 as all the other target-specific codegen decisions are
12290 derived from them. */
12291
12292 void
12293 aarch64_override_options_internal (struct gcc_options *opts)
12294 {
12295 aarch64_tune_flags = selected_tune->flags;
12296 aarch64_tune = selected_tune->sched_core;
12297 /* Make a copy of the tuning parameters attached to the core, which
12298 we may later overwrite. */
12299 aarch64_tune_params = *(selected_tune->tune);
12300 aarch64_architecture_version = selected_arch->architecture_version;
12301
12302 if (opts->x_aarch64_override_tune_string)
12303 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12304 &aarch64_tune_params);
12305
12306 /* This target defaults to strict volatile bitfields. */
12307 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12308 opts->x_flag_strict_volatile_bitfields = 1;
12309
12310 if (aarch64_stack_protector_guard == SSP_GLOBAL
12311 && opts->x_aarch64_stack_protector_guard_offset_str)
12312 {
12313 error ("incompatible options %<-mstack-protector-guard=global%> and "
12314 "%<-mstack-protector-guard-offset=%s%>",
12315 aarch64_stack_protector_guard_offset_str);
12316 }
12317
12318 if (aarch64_stack_protector_guard == SSP_SYSREG
12319 && !(opts->x_aarch64_stack_protector_guard_offset_str
12320 && opts->x_aarch64_stack_protector_guard_reg_str))
12321 {
12322 error ("both %<-mstack-protector-guard-offset%> and "
12323 "%<-mstack-protector-guard-reg%> must be used "
12324 "with %<-mstack-protector-guard=sysreg%>");
12325 }
12326
12327 if (opts->x_aarch64_stack_protector_guard_reg_str)
12328 {
12329 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12330 error ("specify a system register with a small string length.");
12331 }
12332
12333 if (opts->x_aarch64_stack_protector_guard_offset_str)
12334 {
12335 char *end;
12336 const char *str = aarch64_stack_protector_guard_offset_str;
12337 errno = 0;
12338 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12339 if (!*str || *end || errno)
12340 error ("%qs is not a valid offset in %qs", str,
12341 "-mstack-protector-guard-offset=");
12342 aarch64_stack_protector_guard_offset = offs;
12343 }
12344
12345 initialize_aarch64_code_model (opts);
12346 initialize_aarch64_tls_size (opts);
12347
12348 int queue_depth = 0;
12349 switch (aarch64_tune_params.autoprefetcher_model)
12350 {
12351 case tune_params::AUTOPREFETCHER_OFF:
12352 queue_depth = -1;
12353 break;
12354 case tune_params::AUTOPREFETCHER_WEAK:
12355 queue_depth = 0;
12356 break;
12357 case tune_params::AUTOPREFETCHER_STRONG:
12358 queue_depth = max_insn_queue_index + 1;
12359 break;
12360 default:
12361 gcc_unreachable ();
12362 }
12363
12364 /* We don't mind passing in global_options_set here as we don't use
12365 the *options_set structs anyway. */
12366 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12367 queue_depth,
12368 opts->x_param_values,
12369 global_options_set.x_param_values);
12370
12371 /* Set up parameters to be used in prefetching algorithm. Do not
12372 override the defaults unless we are tuning for a core we have
12373 researched values for. */
12374 if (aarch64_tune_params.prefetch->num_slots > 0)
12375 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12376 aarch64_tune_params.prefetch->num_slots,
12377 opts->x_param_values,
12378 global_options_set.x_param_values);
12379 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12380 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12381 aarch64_tune_params.prefetch->l1_cache_size,
12382 opts->x_param_values,
12383 global_options_set.x_param_values);
12384 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12385 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12386 aarch64_tune_params.prefetch->l1_cache_line_size,
12387 opts->x_param_values,
12388 global_options_set.x_param_values);
12389 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12390 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12391 aarch64_tune_params.prefetch->l2_cache_size,
12392 opts->x_param_values,
12393 global_options_set.x_param_values);
12394 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12395 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12396 0,
12397 opts->x_param_values,
12398 global_options_set.x_param_values);
12399 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12400 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12401 aarch64_tune_params.prefetch->minimum_stride,
12402 opts->x_param_values,
12403 global_options_set.x_param_values);
12404
12405 /* Use the alternative scheduling-pressure algorithm by default. */
12406 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12407 opts->x_param_values,
12408 global_options_set.x_param_values);
12409
12410 /* If the user hasn't changed it via configure then set the default to 64 KB
12411 for the backend. */
12412 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12413 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12414 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12415 opts->x_param_values,
12416 global_options_set.x_param_values);
12417
12418 /* Validate the guard size. */
12419 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12420
12421 /* Enforce that the probing interval is the same as the guard size so the
12422 mid-end does the right thing. */
12423 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12424 guard_size,
12425 opts->x_param_values,
12426 global_options_set.x_param_values);
12427
12428 /* The maybe_set calls won't update the value if the user has explicitly
12429 set one, which means we need to validate that the probing interval and
12430 the guard size are equal. */
12431 int probe_interval
12432 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12433 if (guard_size != probe_interval)
12434 error ("stack clash guard size %<%d%> must be equal to probing interval "
12435 "%<%d%>", guard_size, probe_interval);
12436
12437 /* Enable software prefetching at the specified optimization level for
12438 CPUs that have prefetch. Lower the optimization level threshold by 1
12439 when profiling is enabled. */
12440 if (opts->x_flag_prefetch_loop_arrays < 0
12441 && !opts->x_optimize_size
12442 && aarch64_tune_params.prefetch->default_opt_level >= 0
12443 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12444 opts->x_flag_prefetch_loop_arrays = 1;
12445
12446 if (opts->x_aarch64_arch_string == NULL)
12447 opts->x_aarch64_arch_string = selected_arch->name;
12448 if (opts->x_aarch64_cpu_string == NULL)
12449 opts->x_aarch64_cpu_string = selected_cpu->name;
12450 if (opts->x_aarch64_tune_string == NULL)
12451 opts->x_aarch64_tune_string = selected_tune->name;
12452
12453 aarch64_override_options_after_change_1 (opts);
12454 }
12455
12456 /* Print a hint with a suggestion for a core or architecture name that
12457 most closely resembles what the user passed in STR. ARCH is true if
12458 the user is asking for an architecture name. ARCH is false if the user
12459 is asking for a core name. */
12460
12461 static void
12462 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12463 {
12464 auto_vec<const char *> candidates;
12465 const struct processor *entry = arch ? all_architectures : all_cores;
12466 for (; entry->name != NULL; entry++)
12467 candidates.safe_push (entry->name);
12468
12469 #ifdef HAVE_LOCAL_CPU_DETECT
12470 /* Add also "native" as possible value. */
12471 if (arch)
12472 candidates.safe_push ("native");
12473 #endif
12474
12475 char *s;
12476 const char *hint = candidates_list_and_hint (str, s, candidates);
12477 if (hint)
12478 inform (input_location, "valid arguments are: %s;"
12479 " did you mean %qs?", s, hint);
12480 else
12481 inform (input_location, "valid arguments are: %s", s);
12482
12483 XDELETEVEC (s);
12484 }
12485
12486 /* Print a hint with a suggestion for a core name that most closely resembles
12487 what the user passed in STR. */
12488
12489 inline static void
12490 aarch64_print_hint_for_core (const char *str)
12491 {
12492 aarch64_print_hint_for_core_or_arch (str, false);
12493 }
12494
12495 /* Print a hint with a suggestion for an architecture name that most closely
12496 resembles what the user passed in STR. */
12497
12498 inline static void
12499 aarch64_print_hint_for_arch (const char *str)
12500 {
12501 aarch64_print_hint_for_core_or_arch (str, true);
12502 }
12503
12504
12505 /* Print a hint with a suggestion for an extension name
12506 that most closely resembles what the user passed in STR. */
12507
12508 void
12509 aarch64_print_hint_for_extensions (const std::string &str)
12510 {
12511 auto_vec<const char *> candidates;
12512 aarch64_get_all_extension_candidates (&candidates);
12513 char *s;
12514 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12515 if (hint)
12516 inform (input_location, "valid arguments are: %s;"
12517 " did you mean %qs?", s, hint);
12518 else
12519 inform (input_location, "valid arguments are: %s", s);
12520
12521 XDELETEVEC (s);
12522 }
12523
12524 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12525 specified in STR and throw errors if appropriate. Put the results, if
12526 they are valid, in RES and ISA_FLAGS. Return whether the option is
12527 valid. */
12528
12529 static bool
12530 aarch64_validate_mcpu (const char *str, const struct processor **res,
12531 uint64_t *isa_flags)
12532 {
12533 std::string invalid_extension;
12534 enum aarch64_parse_opt_result parse_res
12535 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12536
12537 if (parse_res == AARCH64_PARSE_OK)
12538 return true;
12539
12540 switch (parse_res)
12541 {
12542 case AARCH64_PARSE_MISSING_ARG:
12543 error ("missing cpu name in %<-mcpu=%s%>", str);
12544 break;
12545 case AARCH64_PARSE_INVALID_ARG:
12546 error ("unknown value %qs for %<-mcpu%>", str);
12547 aarch64_print_hint_for_core (str);
12548 break;
12549 case AARCH64_PARSE_INVALID_FEATURE:
12550 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12551 invalid_extension.c_str (), str);
12552 aarch64_print_hint_for_extensions (invalid_extension);
12553 break;
12554 default:
12555 gcc_unreachable ();
12556 }
12557
12558 return false;
12559 }
12560
12561 /* Parse CONST_STR for the branch protection features specified in
12562 aarch64_branch_protect_types and set any global variables required.
12563 Return the parsing result and copy the last processed token from
12564 CONST_STR into LAST_STR so that it can be used for error reporting. */
12565
12566 static enum
12567 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12568 char** last_str)
12569 {
12570 char *str_root = xstrdup (const_str);
12571 char* token_save = NULL;
12572 char *str = strtok_r (str_root, "+", &token_save);
12573 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12574 if (!str)
12575 res = AARCH64_PARSE_MISSING_ARG;
12576 else
12577 {
12578 char *next_str = strtok_r (NULL, "+", &token_save);
12579 /* Reset the branch protection features to their defaults. */
12580 aarch64_handle_no_branch_protection (NULL, NULL);
12581
12582 while (str && res == AARCH64_PARSE_OK)
12583 {
12584 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12585 bool found = false;
12586 /* Search for this type. */
12587 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12588 {
12589 if (strcmp (str, type->name) == 0)
12590 {
12591 found = true;
12592 res = type->handler (str, next_str);
12593 str = next_str;
12594 next_str = strtok_r (NULL, "+", &token_save);
12595 }
12596 else
12597 type++;
12598 }
12599 if (found && res == AARCH64_PARSE_OK)
12600 {
12601 bool found_subtype = true;
12602 /* Loop through each token until we find one that isn't a
12603 subtype. */
12604 while (found_subtype)
12605 {
12606 found_subtype = false;
12607 const aarch64_branch_protect_type *subtype = type->subtypes;
12608 /* Search for the subtype. */
12609 while (str && subtype && subtype->name && !found_subtype
12610 && res == AARCH64_PARSE_OK)
12611 {
12612 if (strcmp (str, subtype->name) == 0)
12613 {
12614 found_subtype = true;
12615 res = subtype->handler (str, next_str);
12616 str = next_str;
12617 next_str = strtok_r (NULL, "+", &token_save);
12618 }
12619 else
12620 subtype++;
12621 }
12622 }
12623 }
12624 else if (!found)
12625 res = AARCH64_PARSE_INVALID_ARG;
12626 }
12627 }
12628 /* Copy the last processed token into the argument to pass it back.
12629 Used by option and attribute validation to print the offending token. */
12630 if (last_str)
12631 {
12632 if (str) strcpy (*last_str, str);
12633 else *last_str = NULL;
12634 }
12635 if (res == AARCH64_PARSE_OK)
12636 {
12637 /* If needed, alloc the accepted string then copy in const_str.
12638 Used by override_option_after_change_1. */
12639 if (!accepted_branch_protection_string)
12640 accepted_branch_protection_string = (char *) xmalloc (
12641 BRANCH_PROTECT_STR_MAX
12642 + 1);
12643 strncpy (accepted_branch_protection_string, const_str,
12644 BRANCH_PROTECT_STR_MAX + 1);
12645 /* Forcibly null-terminate. */
12646 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12647 }
12648 return res;
12649 }
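/* Illustrative sketch (not part of the original file): the parser above
   walks "+"-separated tokens with strtok_r, matching each against the
   top-level types and then against that type's subtypes.  A stand-alone
   model of the tokenization, assuming an input such as "pac-ret+leaf":

     #include <stdio.h>
     #include <string.h>

     int
     main (void)
     {
       char buf[] = "pac-ret+leaf";
       char *save = NULL;
       for (char *tok = strtok_r (buf, "+", &save);
            tok != NULL;
            tok = strtok_r (NULL, "+", &save))
         printf ("token: %s\n", tok);   // prints "pac-ret" then "leaf"
       return 0;
     }

   Whether a token such as "leaf" is accepted depends on it being a subtype
   of the previously matched type, exactly as in the nested loops above.  */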
12650
12651 static bool
12652 aarch64_validate_mbranch_protection (const char *const_str)
12653 {
12654 char *str = (char *) xmalloc (strlen (const_str) + 1);
12655 enum aarch64_parse_opt_result res =
12656 aarch64_parse_branch_protection (const_str, &str);
12657 if (res == AARCH64_PARSE_INVALID_ARG)
12658 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12659 else if (res == AARCH64_PARSE_MISSING_ARG)
12660 error ("missing argument for %<-mbranch-protection=%>");
12661 free (str);
12662 return res == AARCH64_PARSE_OK;
12663 }
12664
12665 /* Validate a command-line -march option. Parse the arch and extensions
12666 (if any) specified in STR and throw errors if appropriate. Put the
12667 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12668 option is valid. */
12669
12670 static bool
12671 aarch64_validate_march (const char *str, const struct processor **res,
12672 uint64_t *isa_flags)
12673 {
12674 std::string invalid_extension;
12675 enum aarch64_parse_opt_result parse_res
12676 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12677
12678 if (parse_res == AARCH64_PARSE_OK)
12679 return true;
12680
12681 switch (parse_res)
12682 {
12683 case AARCH64_PARSE_MISSING_ARG:
12684 error ("missing arch name in %<-march=%s%>", str);
12685 break;
12686 case AARCH64_PARSE_INVALID_ARG:
12687 error ("unknown value %qs for %<-march%>", str);
12688 aarch64_print_hint_for_arch (str);
12689 break;
12690 case AARCH64_PARSE_INVALID_FEATURE:
12691 error ("invalid feature modifier %qs in %<-march=%s%>",
12692 invalid_extension.c_str (), str);
12693 aarch64_print_hint_for_extensions (invalid_extension);
12694 break;
12695 default:
12696 gcc_unreachable ();
12697 }
12698
12699 return false;
12700 }
12701
12702 /* Validate a command-line -mtune option. Parse the cpu
12703 specified in STR and throw errors if appropriate. Put the
12704 result, if it is valid, in RES. Return whether the option is
12705 valid. */
12706
12707 static bool
12708 aarch64_validate_mtune (const char *str, const struct processor **res)
12709 {
12710 enum aarch64_parse_opt_result parse_res
12711 = aarch64_parse_tune (str, res);
12712
12713 if (parse_res == AARCH64_PARSE_OK)
12714 return true;
12715
12716 switch (parse_res)
12717 {
12718 case AARCH64_PARSE_MISSING_ARG:
12719 error ("missing cpu name in %<-mtune=%s%>", str);
12720 break;
12721 case AARCH64_PARSE_INVALID_ARG:
12722 error ("unknown value %qs for %<-mtune%>", str);
12723 aarch64_print_hint_for_core (str);
12724 break;
12725 default:
12726 gcc_unreachable ();
12727 }
12728 return false;
12729 }
12730
12731 /* Return the CPU corresponding to the enum CPU.
12732 If it doesn't specify a cpu, return the default. */
12733
12734 static const struct processor *
12735 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12736 {
12737 if (cpu != aarch64_none)
12738 return &all_cores[cpu];
12739
12740 /* The & 0x3f is to extract the bottom 6 bits that encode the
12741 default cpu as selected by the --with-cpu GCC configure option
12742 in config.gcc.
12743 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12744 flags mechanism should be reworked to make it more sane. */
12745 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12746 }
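/* Illustrative sketch (not part of the original file) of the packing that
   the "& 0x3f" above and the ">> 6" in aarch64_override_options assume:
   the low 6 bits of TARGET_CPU_DEFAULT select the default core and the
   remaining bits carry its default ISA flags.

     unsigned long long packed = TARGET_CPU_DEFAULT;  // value from config.gcc
     unsigned int core_index = packed & 0x3f;         // index into all_cores
     unsigned long long default_isa = packed >> 6;    // AARCH64_CPU_DEFAULT_FLAGS

   The exact encoding is produced by config.gcc, so treat this only as a
   reading aid for the two extractions used in this file.  */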
12747
12748 /* Return the architecture corresponding to the enum ARCH.
12749 If it doesn't specify a valid architecture, return the default. */
12750
12751 static const struct processor *
12752 aarch64_get_arch (enum aarch64_arch arch)
12753 {
12754 if (arch != aarch64_no_arch)
12755 return &all_architectures[arch];
12756
12757 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12758
12759 return &all_architectures[cpu->arch];
12760 }
12761
12762 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12763
12764 static poly_uint16
12765 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12766 {
12767 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12768 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12769 deciding which .md file patterns to use and when deciding whether
12770 something is a legitimate address or constant. */
12771 if (value == SVE_SCALABLE || value == SVE_128)
12772 return poly_uint16 (2, 2);
12773 else
12774 return (int) value / 64;
12775 }
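/* Worked example (not part of the original file) of the mapping above,
   where VG counts 64-bit granules of the vector length:

     -msve-vector-bits=scalable -> poly_uint16 (2, 2)  (length-agnostic)
     -msve-vector-bits=128      -> poly_uint16 (2, 2)  (also agnostic, see comment)
     -msve-vector-bits=256      -> 256 / 64 == 4
     -msve-vector-bits=512      -> 512 / 64 == 8
     -msve-vector-bits=2048     -> 2048 / 64 == 32  */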
12776
12777 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12778 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12779 tuning structs. In particular it must set selected_tune and
12780 aarch64_isa_flags that define the available ISA features and tuning
12781 decisions. It must also set selected_arch as this will be used to
12782 output the .arch asm tags for each function. */
12783
12784 static void
12785 aarch64_override_options (void)
12786 {
12787 uint64_t cpu_isa = 0;
12788 uint64_t arch_isa = 0;
12789 aarch64_isa_flags = 0;
12790
12791 bool valid_cpu = true;
12792 bool valid_tune = true;
12793 bool valid_arch = true;
12794
12795 selected_cpu = NULL;
12796 selected_arch = NULL;
12797 selected_tune = NULL;
12798
12799 if (aarch64_branch_protection_string)
12800 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12801
12802 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12803 If either of -march or -mtune is given, they override their
12804 respective component of -mcpu. */
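  /* For instance (illustrative only, with a hypothetical core "foo" whose
     architecture is "bar"):
       -mcpu=foo              acts like -march=bar -mtune=foo
       -mcpu=foo -mtune=baz   still takes the architecture from foo
       -mcpu=foo -march=qux   warns below if qux differs from bar and then
                              prefers the -march ISA flags.  */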
12805 if (aarch64_cpu_string)
12806 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12807 &cpu_isa);
12808
12809 if (aarch64_arch_string)
12810 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12811 &arch_isa);
12812
12813 if (aarch64_tune_string)
12814 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12815
12816 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12817 SUBTARGET_OVERRIDE_OPTIONS;
12818 #endif
12819
12820 /* If the user did not specify a processor, choose the default
12821 one for them. This will be the CPU set during configuration using
12822 --with-cpu, otherwise it is "generic". */
12823 if (!selected_cpu)
12824 {
12825 if (selected_arch)
12826 {
12827 selected_cpu = &all_cores[selected_arch->ident];
12828 aarch64_isa_flags = arch_isa;
12829 explicit_arch = selected_arch->arch;
12830 }
12831 else
12832 {
12833 /* Get default configure-time CPU. */
12834 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12835 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12836 }
12837
12838 if (selected_tune)
12839 explicit_tune_core = selected_tune->ident;
12840 }
12841 /* If both -mcpu and -march are specified check that they are architecturally
12842 compatible, warn if they're not and prefer the -march ISA flags. */
12843 else if (selected_arch)
12844 {
12845 if (selected_arch->arch != selected_cpu->arch)
12846 {
12847 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12848 all_architectures[selected_cpu->arch].name,
12849 selected_arch->name);
12850 }
12851 aarch64_isa_flags = arch_isa;
12852 explicit_arch = selected_arch->arch;
12853 explicit_tune_core = selected_tune ? selected_tune->ident
12854 : selected_cpu->ident;
12855 }
12856 else
12857 {
12858 /* -mcpu but no -march. */
12859 aarch64_isa_flags = cpu_isa;
12860 explicit_tune_core = selected_tune ? selected_tune->ident
12861 : selected_cpu->ident;
12862 gcc_assert (selected_cpu);
12863 selected_arch = &all_architectures[selected_cpu->arch];
12864 explicit_arch = selected_arch->arch;
12865 }
12866
12867 /* Set the arch as well, as we will need it when outputting
12868 the .arch directive in assembly. */
12869 if (!selected_arch)
12870 {
12871 gcc_assert (selected_cpu);
12872 selected_arch = &all_architectures[selected_cpu->arch];
12873 }
12874
12875 if (!selected_tune)
12876 selected_tune = selected_cpu;
12877
12878 if (aarch64_enable_bti == 2)
12879 {
12880 #ifdef TARGET_ENABLE_BTI
12881 aarch64_enable_bti = 1;
12882 #else
12883 aarch64_enable_bti = 0;
12884 #endif
12885 }
12886
12887 /* Return address signing is currently not supported for ILP32 targets. For
12888 LP64 targets use the configured option in the absence of a command-line
12889 option for -mbranch-protection. */
12890 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12891 {
12892 #ifdef TARGET_ENABLE_PAC_RET
12893 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12894 #else
12895 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12896 #endif
12897 }
12898
12899 #ifndef HAVE_AS_MABI_OPTION
12900 /* The compiler may have been configured with 2.23.* binutils, which does
12901 not have support for ILP32. */
12902 if (TARGET_ILP32)
12903 error ("assembler does not support %<-mabi=ilp32%>");
12904 #endif
12905
12906 /* Convert -msve-vector-bits to a VG count. */
12907 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12908
12909 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12910 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12911
12912 /* Make sure we properly set up the explicit options. */
12913 if ((aarch64_cpu_string && valid_cpu)
12914 || (aarch64_tune_string && valid_tune))
12915 gcc_assert (explicit_tune_core != aarch64_none);
12916
12917 if ((aarch64_cpu_string && valid_cpu)
12918 || (aarch64_arch_string && valid_arch))
12919 gcc_assert (explicit_arch != aarch64_no_arch);
12920
12921 /* The pass to insert speculation tracking runs before
12922 shrink-wrapping and the latter does not know how to update the
12923 tracking status. So disable it in this case. */
12924 if (aarch64_track_speculation)
12925 flag_shrink_wrap = 0;
12926
12927 aarch64_override_options_internal (&global_options);
12928
12929 /* Save these options as the default ones in case we push and pop them later
12930 while processing functions with potential target attributes. */
12931 target_option_default_node = target_option_current_node
12932 = build_target_option_node (&global_options);
12933 }
12934
12935 /* Implement targetm.override_options_after_change. */
12936
12937 static void
12938 aarch64_override_options_after_change (void)
12939 {
12940 aarch64_override_options_after_change_1 (&global_options);
12941 }
12942
12943 static struct machine_function *
12944 aarch64_init_machine_status (void)
12945 {
12946 struct machine_function *machine;
12947 machine = ggc_cleared_alloc<machine_function> ();
12948 return machine;
12949 }
12950
12951 void
12952 aarch64_init_expanders (void)
12953 {
12954 init_machine_status = aarch64_init_machine_status;
12955 }
12956
12957 /* Initialize the code model from the -mcmodel= and PIC options. */
12958 static void
12959 initialize_aarch64_code_model (struct gcc_options *opts)
12960 {
12961 if (opts->x_flag_pic)
12962 {
12963 switch (opts->x_aarch64_cmodel_var)
12964 {
12965 case AARCH64_CMODEL_TINY:
12966 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12967 break;
12968 case AARCH64_CMODEL_SMALL:
12969 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12970 aarch64_cmodel = (flag_pic == 2
12971 ? AARCH64_CMODEL_SMALL_PIC
12972 : AARCH64_CMODEL_SMALL_SPIC);
12973 #else
12974 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12975 #endif
12976 break;
12977 case AARCH64_CMODEL_LARGE:
12978 sorry ("code model %qs with %<-f%s%>", "large",
12979 opts->x_flag_pic > 1 ? "PIC" : "pic");
12980 break;
12981 default:
12982 gcc_unreachable ();
12983 }
12984 }
12985 else
12986 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12987 }
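/* Illustrative summary (not part of the original file) of the mapping
   implemented above, assuming the assembler supports the small PIC relocs:

     -mcmodel=tiny  -fpic/-fPIC  -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=small -fpic        -> AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small -fPIC        -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic/-fPIC  -> rejected with sorry ()
     no PIC option               -> the -mcmodel= value is used directly.  */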
12988
12989 /* Implement TARGET_OPTION_SAVE. */
12990
12991 static void
12992 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12993 {
12994 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12995 ptr->x_aarch64_branch_protection_string
12996 = opts->x_aarch64_branch_protection_string;
12997 }
12998
12999 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13000 using the information saved in PTR. */
13001
13002 static void
13003 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13004 {
13005 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13006 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13007 opts->x_explicit_arch = ptr->x_explicit_arch;
13008 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13009 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13010 opts->x_aarch64_branch_protection_string
13011 = ptr->x_aarch64_branch_protection_string;
13012 if (opts->x_aarch64_branch_protection_string)
13013 {
13014 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13015 NULL);
13016 }
13017
13018 aarch64_override_options_internal (opts);
13019 }
13020
13021 /* Implement TARGET_OPTION_PRINT. */
13022
13023 static void
13024 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13025 {
13026 const struct processor *cpu
13027 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13028 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13029 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13030 std::string extension
13031 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13032
13033 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13034 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13035 arch->name, extension.c_str ());
13036 }
13037
13038 static GTY(()) tree aarch64_previous_fndecl;
13039
13040 void
13041 aarch64_reset_previous_fndecl (void)
13042 {
13043 aarch64_previous_fndecl = NULL;
13044 }
13045
13046 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13047 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13048 make sure optab availability predicates are recomputed when necessary. */
13049
13050 void
13051 aarch64_save_restore_target_globals (tree new_tree)
13052 {
13053 if (TREE_TARGET_GLOBALS (new_tree))
13054 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13055 else if (new_tree == target_option_default_node)
13056 restore_target_globals (&default_target_globals);
13057 else
13058 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13059 }
13060
13061 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13062 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13063 of the function, if such exists. This function may be called multiple
13064 times on a single function so use aarch64_previous_fndecl to avoid
13065 setting up identical state. */
13066
13067 static void
13068 aarch64_set_current_function (tree fndecl)
13069 {
13070 if (!fndecl || fndecl == aarch64_previous_fndecl)
13071 return;
13072
13073 tree old_tree = (aarch64_previous_fndecl
13074 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13075 : NULL_TREE);
13076
13077 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13078
13079 /* If current function has no attributes but the previous one did,
13080 use the default node. */
13081 if (!new_tree && old_tree)
13082 new_tree = target_option_default_node;
13083
13084 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13085 the default have been handled by aarch64_save_restore_target_globals from
13086 aarch64_pragma_target_parse. */
13087 if (old_tree == new_tree)
13088 return;
13089
13090 aarch64_previous_fndecl = fndecl;
13091
13092 /* First set the target options. */
13093 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13094
13095 aarch64_save_restore_target_globals (new_tree);
13096 }
13097
13098 /* Enum describing the various ways we can handle attributes.
13099 In many cases we can reuse the generic option handling machinery. */
13100
13101 enum aarch64_attr_opt_type
13102 {
13103 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13104 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13105 aarch64_attr_enum, /* Attribute sets an enum variable. */
13106 aarch64_attr_custom /* Attribute requires a custom handling function. */
13107 };
13108
13109 /* All the information needed to handle a target attribute.
13110 NAME is the name of the attribute.
13111 ATTR_TYPE specifies the type of behavior of the attribute as described
13112 in the definition of enum aarch64_attr_opt_type.
13113 ALLOW_NEG is true if the attribute supports a "no-" form.
13114 HANDLER is the function that takes the attribute string as an argument.
13115 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13116 OPT_NUM is the enum specifying the option that the attribute modifies.
13117 This is needed for attributes that mirror the behavior of a command-line
13118 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13119 aarch64_attr_enum. */
13120
13121 struct aarch64_attribute_info
13122 {
13123 const char *name;
13124 enum aarch64_attr_opt_type attr_type;
13125 bool allow_neg;
13126 bool (*handler) (const char *);
13127 enum opt_code opt_num;
13128 };
13129
13130 /* Handle the ARCH_STR argument to the arch= target attribute. */
13131
13132 static bool
13133 aarch64_handle_attr_arch (const char *str)
13134 {
13135 const struct processor *tmp_arch = NULL;
13136 std::string invalid_extension;
13137 enum aarch64_parse_opt_result parse_res
13138 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13139
13140 if (parse_res == AARCH64_PARSE_OK)
13141 {
13142 gcc_assert (tmp_arch);
13143 selected_arch = tmp_arch;
13144 explicit_arch = selected_arch->arch;
13145 return true;
13146 }
13147
13148 switch (parse_res)
13149 {
13150 case AARCH64_PARSE_MISSING_ARG:
13151 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13152 break;
13153 case AARCH64_PARSE_INVALID_ARG:
13154 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13155 aarch64_print_hint_for_arch (str);
13156 break;
13157 case AARCH64_PARSE_INVALID_FEATURE:
13158 error ("invalid feature modifier %s of value (\"%s\") in "
13159 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13160 aarch64_print_hint_for_extensions (invalid_extension);
13161 break;
13162 default:
13163 gcc_unreachable ();
13164 }
13165
13166 return false;
13167 }
13168
13169 /* Handle the argument CPU_STR to the cpu= target attribute. */
13170
13171 static bool
13172 aarch64_handle_attr_cpu (const char *str)
13173 {
13174 const struct processor *tmp_cpu = NULL;
13175 std::string invalid_extension;
13176 enum aarch64_parse_opt_result parse_res
13177 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13178
13179 if (parse_res == AARCH64_PARSE_OK)
13180 {
13181 gcc_assert (tmp_cpu);
13182 selected_tune = tmp_cpu;
13183 explicit_tune_core = selected_tune->ident;
13184
13185 selected_arch = &all_architectures[tmp_cpu->arch];
13186 explicit_arch = selected_arch->arch;
13187 return true;
13188 }
13189
13190 switch (parse_res)
13191 {
13192 case AARCH64_PARSE_MISSING_ARG:
13193 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13194 break;
13195 case AARCH64_PARSE_INVALID_ARG:
13196 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13197 aarch64_print_hint_for_core (str);
13198 break;
13199 case AARCH64_PARSE_INVALID_FEATURE:
13200 error ("invalid feature modifier %s of value (\"%s\") in "
13201 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13202 aarch64_print_hint_for_extensions (invalid_extension);
13203 break;
13204 default:
13205 gcc_unreachable ();
13206 }
13207
13208 return false;
13209 }
13210
13211 /* Handle the argument STR to the branch-protection= attribute. */
13212
13213 static bool
13214 aarch64_handle_attr_branch_protection (const char* str)
13215 {
13216 char *err_str = (char *) xmalloc (strlen (str) + 1);
13217 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13218 &err_str);
13219 bool success = false;
13220 switch (res)
13221 {
13222 case AARCH64_PARSE_MISSING_ARG:
13223 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13224 " attribute");
13225 break;
13226 case AARCH64_PARSE_INVALID_ARG:
13227 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13228 "=\")%> pragma or attribute", err_str);
13229 break;
13230 case AARCH64_PARSE_OK:
13231 success = true;
13232 /* Fall through. */
13233 case AARCH64_PARSE_INVALID_FEATURE:
13234 break;
13235 default:
13236 gcc_unreachable ();
13237 }
13238 free (err_str);
13239 return success;
13240 }
13241
13242 /* Handle the argument STR to the tune= target attribute. */
13243
13244 static bool
13245 aarch64_handle_attr_tune (const char *str)
13246 {
13247 const struct processor *tmp_tune = NULL;
13248 enum aarch64_parse_opt_result parse_res
13249 = aarch64_parse_tune (str, &tmp_tune);
13250
13251 if (parse_res == AARCH64_PARSE_OK)
13252 {
13253 gcc_assert (tmp_tune);
13254 selected_tune = tmp_tune;
13255 explicit_tune_core = selected_tune->ident;
13256 return true;
13257 }
13258
13259 switch (parse_res)
13260 {
13261 case AARCH64_PARSE_INVALID_ARG:
13262 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13263 aarch64_print_hint_for_core (str);
13264 break;
13265 default:
13266 gcc_unreachable ();
13267 }
13268
13269 return false;
13270 }
13271
13272 /* Parse an architecture extensions target attribute string specified in STR.
13273 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13274 if successful. Update aarch64_isa_flags to reflect the ISA features
13275 modified. */
13276
13277 static bool
13278 aarch64_handle_attr_isa_flags (char *str)
13279 {
13280 enum aarch64_parse_opt_result parse_res;
13281 uint64_t isa_flags = aarch64_isa_flags;
13282
13283 /* We allow "+nothing" in the beginning to clear out all architectural
13284 features if the user wants to handpick specific features. */
13285 if (strncmp ("+nothing", str, 8) == 0)
13286 {
13287 isa_flags = 0;
13288 str += 8;
13289 }
13290
13291 std::string invalid_extension;
13292 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13293
13294 if (parse_res == AARCH64_PARSE_OK)
13295 {
13296 aarch64_isa_flags = isa_flags;
13297 return true;
13298 }
13299
13300 switch (parse_res)
13301 {
13302 case AARCH64_PARSE_MISSING_ARG:
13303 error ("missing value in %<target()%> pragma or attribute");
13304 break;
13305
13306 case AARCH64_PARSE_INVALID_FEATURE:
13307 error ("invalid feature modifier %s of value (\"%s\") in "
13308 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13309 break;
13310
13311 default:
13312 gcc_unreachable ();
13313 }
13314
13315 return false;
13316 }
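/* Usage sketch (not part of the original file): the strings handled above
   come from user code such as

     __attribute__ ((target ("+nothing+fp")))
     int
     only_fp (double x)
     {
       return (int) x;
     }

     __attribute__ ((target ("+crc+nosimd")))
     unsigned crc_helper (unsigned x);

   "+nothing" first clears the architectural features, and each later
   "+ext" / "+noext" modifier then adds or removes one extension.  */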
13317
13318 /* The target attributes that we support. On top of these we also support just
13319 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13320 handled explicitly in aarch64_process_one_target_attr. */
13321
13322 static const struct aarch64_attribute_info aarch64_attributes[] =
13323 {
13324 { "general-regs-only", aarch64_attr_mask, false, NULL,
13325 OPT_mgeneral_regs_only },
13326 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13327 OPT_mfix_cortex_a53_835769 },
13328 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13329 OPT_mfix_cortex_a53_843419 },
13330 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13331 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13332 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13333 OPT_momit_leaf_frame_pointer },
13334 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13335 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13336 OPT_march_ },
13337 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13338 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13339 OPT_mtune_ },
13340 { "branch-protection", aarch64_attr_custom, false,
13341 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13342 { "sign-return-address", aarch64_attr_enum, false, NULL,
13343 OPT_msign_return_address_ },
13344 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13345 };
13346
13347 /* Parse ARG_STR which contains the definition of one target attribute.
13348 Show appropriate errors if any or return true if the attribute is valid. */
13349
13350 static bool
13351 aarch64_process_one_target_attr (char *arg_str)
13352 {
13353 bool invert = false;
13354
13355 size_t len = strlen (arg_str);
13356
13357 if (len == 0)
13358 {
13359 error ("malformed %<target()%> pragma or attribute");
13360 return false;
13361 }
13362
13363 char *str_to_check = (char *) alloca (len + 1);
13364 strcpy (str_to_check, arg_str);
13365
13366 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13367 It is easier to detect and handle it explicitly here rather than going
13368 through the machinery for the rest of the target attributes in this
13369 function. */
13370 if (*str_to_check == '+')
13371 return aarch64_handle_attr_isa_flags (str_to_check);
13372
13373 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13374 {
13375 invert = true;
13376 str_to_check += 3;
13377 }
13378 char *arg = strchr (str_to_check, '=');
13379
13380 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13381 and point ARG to "foo". */
13382 if (arg)
13383 {
13384 *arg = '\0';
13385 arg++;
13386 }
13387 const struct aarch64_attribute_info *p_attr;
13388 bool found = false;
13389 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13390 {
13391 /* If the names don't match up, or the user has given an argument
13392 to an attribute that doesn't accept one, or didn't give an argument
13393 to an attribute that expects one, fail to match. */
13394 if (strcmp (str_to_check, p_attr->name) != 0)
13395 continue;
13396
13397 found = true;
13398 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13399 || p_attr->attr_type == aarch64_attr_enum;
13400
13401 if (attr_need_arg_p ^ (arg != NULL))
13402 {
13403 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13404 return false;
13405 }
13406
13407 /* If the name matches but the attribute does not allow "no-" versions
13408 then we can't match. */
13409 if (invert && !p_attr->allow_neg)
13410 {
13411 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13412 return false;
13413 }
13414
13415 switch (p_attr->attr_type)
13416 {
13417 /* Has a custom handler registered.
13418 For example, cpu=, arch=, tune=. */
13419 case aarch64_attr_custom:
13420 gcc_assert (p_attr->handler);
13421 if (!p_attr->handler (arg))
13422 return false;
13423 break;
13424
13425 /* Either set or unset a boolean option. */
13426 case aarch64_attr_bool:
13427 {
13428 struct cl_decoded_option decoded;
13429
13430 generate_option (p_attr->opt_num, NULL, !invert,
13431 CL_TARGET, &decoded);
13432 aarch64_handle_option (&global_options, &global_options_set,
13433 &decoded, input_location);
13434 break;
13435 }
13436 /* Set or unset a bit in the target_flags. aarch64_handle_option
13437 should know what mask to apply given the option number. */
13438 case aarch64_attr_mask:
13439 {
13440 struct cl_decoded_option decoded;
13441 /* We only need to specify the option number.
13442 aarch64_handle_option will know which mask to apply. */
13443 decoded.opt_index = p_attr->opt_num;
13444 decoded.value = !invert;
13445 aarch64_handle_option (&global_options, &global_options_set,
13446 &decoded, input_location);
13447 break;
13448 }
13449 /* Use the option setting machinery to set an option to an enum. */
13450 case aarch64_attr_enum:
13451 {
13452 gcc_assert (arg);
13453 bool valid;
13454 int value;
13455 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13456 &value, CL_TARGET);
13457 if (valid)
13458 {
13459 set_option (&global_options, NULL, p_attr->opt_num, value,
13460 NULL, DK_UNSPECIFIED, input_location,
13461 global_dc);
13462 }
13463 else
13464 {
13465 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13466 }
13467 break;
13468 }
13469 default:
13470 gcc_unreachable ();
13471 }
13472 }
13473
13474 /* If we reached here we either have found an attribute and validated
13475 it or didn't match any. If we matched an attribute but its arguments
13476 were malformed we will have returned false already. */
13477 return found;
13478 }
13479
13480 /* Count how many times the character C appears in
13481 NULL-terminated string STR. */
13482
13483 static unsigned int
13484 num_occurences_in_str (char c, char *str)
13485 {
13486 unsigned int res = 0;
13487 while (*str != '\0')
13488 {
13489 if (*str == c)
13490 res++;
13491
13492 str++;
13493 }
13494
13495 return res;
13496 }
13497
13498 /* Parse the tree in ARGS that contains the target attribute information
13499 and update the global target options space. */
13500
13501 bool
13502 aarch64_process_target_attr (tree args)
13503 {
13504 if (TREE_CODE (args) == TREE_LIST)
13505 {
13506 do
13507 {
13508 tree head = TREE_VALUE (args);
13509 if (head)
13510 {
13511 if (!aarch64_process_target_attr (head))
13512 return false;
13513 }
13514 args = TREE_CHAIN (args);
13515 } while (args);
13516
13517 return true;
13518 }
13519
13520 if (TREE_CODE (args) != STRING_CST)
13521 {
13522 error ("attribute %<target%> argument not a string");
13523 return false;
13524 }
13525
13526 size_t len = strlen (TREE_STRING_POINTER (args));
13527 char *str_to_check = (char *) alloca (len + 1);
13528 strcpy (str_to_check, TREE_STRING_POINTER (args));
13529
13530 if (len == 0)
13531 {
13532 error ("malformed %<target()%> pragma or attribute");
13533 return false;
13534 }
13535
13536 /* Used to catch empty strings between commas, i.e.
13537 attribute ((target ("attr1,,attr2"))). */
13538 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13539
13540 /* Handle multiple target attributes separated by ','. */
13541 char *token = strtok_r (str_to_check, ",", &str_to_check);
13542
13543 unsigned int num_attrs = 0;
13544 while (token)
13545 {
13546 num_attrs++;
13547 if (!aarch64_process_one_target_attr (token))
13548 {
13549 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13550 return false;
13551 }
13552
13553 token = strtok_r (NULL, ",", &str_to_check);
13554 }
13555
13556 if (num_attrs != num_commas + 1)
13557 {
13558 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13559 return false;
13560 }
13561
13562 return true;
13563 }
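/* Illustrative example (not part of the original file) of the comma
   handling above: a string such as

     __attribute__ ((target ("arch=armv8-a+crc,tune=cortex-a53")))

   yields two tokens and one comma, so num_attrs == num_commas + 1 and the
   attribute is accepted, whereas "arch=armv8-a+crc,,tune=cortex-a53" still
   yields two tokens from strtok_r but two commas, so it is rejected as
   malformed.  */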
13564
13565 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13566 process attribute ((target ("..."))). */
13567
13568 static bool
13569 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13570 {
13571 struct cl_target_option cur_target;
13572 bool ret;
13573 tree old_optimize;
13574 tree new_target, new_optimize;
13575 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13576
13577 /* If what we're processing is the current pragma string then the
13578 target option node is already stored in target_option_current_node
13579 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13580 having to re-parse the string. This is especially useful to keep
13581 arm_neon.h compile times down since that header contains a lot
13582 of intrinsics enclosed in pragmas. */
13583 if (!existing_target && args == current_target_pragma)
13584 {
13585 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13586 return true;
13587 }
13588 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13589
13590 old_optimize = build_optimization_node (&global_options);
13591 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13592
13593 /* If the function changed the optimization levels as well as setting
13594 target options, start with the optimizations specified. */
13595 if (func_optimize && func_optimize != old_optimize)
13596 cl_optimization_restore (&global_options,
13597 TREE_OPTIMIZATION (func_optimize));
13598
13599 /* Save the current target options to restore at the end. */
13600 cl_target_option_save (&cur_target, &global_options);
13601
13602 /* If fndecl already has some target attributes applied to it, unpack
13603 them so that we add this attribute on top of them, rather than
13604 overwriting them. */
13605 if (existing_target)
13606 {
13607 struct cl_target_option *existing_options
13608 = TREE_TARGET_OPTION (existing_target);
13609
13610 if (existing_options)
13611 cl_target_option_restore (&global_options, existing_options);
13612 }
13613 else
13614 cl_target_option_restore (&global_options,
13615 TREE_TARGET_OPTION (target_option_current_node));
13616
13617 ret = aarch64_process_target_attr (args);
13618
13619 /* Set up any additional state. */
13620 if (ret)
13621 {
13622 aarch64_override_options_internal (&global_options);
13623 /* Initialize SIMD builtins if we haven't already.
13624 Set current_target_pragma to NULL for the duration so that
13625 the builtin initialization code doesn't try to tag the functions
13626 being built with the attributes specified by any current pragma, thus
13627 going into an infinite recursion. */
13628 if (TARGET_SIMD)
13629 {
13630 tree saved_current_target_pragma = current_target_pragma;
13631 current_target_pragma = NULL;
13632 aarch64_init_simd_builtins ();
13633 current_target_pragma = saved_current_target_pragma;
13634 }
13635 new_target = build_target_option_node (&global_options);
13636 }
13637 else
13638 new_target = NULL;
13639
13640 new_optimize = build_optimization_node (&global_options);
13641
13642 if (fndecl && ret)
13643 {
13644 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13645
13646 if (old_optimize != new_optimize)
13647 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13648 }
13649
13650 cl_target_option_restore (&global_options, &cur_target);
13651
13652 if (old_optimize != new_optimize)
13653 cl_optimization_restore (&global_options,
13654 TREE_OPTIMIZATION (old_optimize));
13655 return ret;
13656 }
13657
13658 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13659 tri-bool options (yes, no, don't care) and the default value is
13660 DEF, determine whether to reject inlining. */
13661
13662 static bool
13663 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13664 int dont_care, int def)
13665 {
13666 /* If the callee doesn't care, always allow inlining. */
13667 if (callee == dont_care)
13668 return true;
13669
13670 /* If the caller doesn't care, always allow inlining. */
13671 if (caller == dont_care)
13672 return true;
13673
13674 /* Otherwise, allow inlining if either the callee and caller values
13675 agree, or if the callee is using the default value. */
13676 return (callee == caller || callee == def);
13677 }
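/* Worked truth table (not part of the original file) for the helper above,
   using 2 as the "don't care" value as the callers below do:

     caller  callee  def  ->  inlining allowed?
       2       2      0        yes (callee doesn't care)
       0       2      1        yes (callee doesn't care)
       2       1      0        yes (caller doesn't care)
       1       0      1        no  (explicit mismatch, callee not default)
       1       0      0        yes (callee uses the default value)
       1       1      0        yes (values agree)  */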
13678
13679 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13680 to inline CALLEE into CALLER based on target-specific info.
13681 Make sure that the caller and callee have compatible architectural
13682 features. Then go through the other possible target attributes
13683 and see if they can block inlining. Try not to reject always_inline
13684 callees unless they are incompatible architecturally. */
13685
13686 static bool
13687 aarch64_can_inline_p (tree caller, tree callee)
13688 {
13689 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13690 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13691
13692 struct cl_target_option *caller_opts
13693 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13694 : target_option_default_node);
13695
13696 struct cl_target_option *callee_opts
13697 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13698 : target_option_default_node);
13699
13700 /* Callee's ISA flags should be a subset of the caller's. */
13701 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13702 != callee_opts->x_aarch64_isa_flags)
13703 return false;
13704
13705 /* Allow non-strict aligned functions inlining into strict
13706 aligned ones. */
13707 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13708 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13709 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13710 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13711 return false;
13712
13713 bool always_inline = lookup_attribute ("always_inline",
13714 DECL_ATTRIBUTES (callee));
13715
13716 /* If the architectural features match up and the callee is always_inline
13717 then the other attributes don't matter. */
13718 if (always_inline)
13719 return true;
13720
13721 if (caller_opts->x_aarch64_cmodel_var
13722 != callee_opts->x_aarch64_cmodel_var)
13723 return false;
13724
13725 if (caller_opts->x_aarch64_tls_dialect
13726 != callee_opts->x_aarch64_tls_dialect)
13727 return false;
13728
13729 /* Honour explicit requests to workaround errata. */
13730 if (!aarch64_tribools_ok_for_inlining_p (
13731 caller_opts->x_aarch64_fix_a53_err835769,
13732 callee_opts->x_aarch64_fix_a53_err835769,
13733 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13734 return false;
13735
13736 if (!aarch64_tribools_ok_for_inlining_p (
13737 caller_opts->x_aarch64_fix_a53_err843419,
13738 callee_opts->x_aarch64_fix_a53_err843419,
13739 2, TARGET_FIX_ERR_A53_843419))
13740 return false;
13741
13742 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13743 caller and callee and they don't match up, reject inlining. */
13744 if (!aarch64_tribools_ok_for_inlining_p (
13745 caller_opts->x_flag_omit_leaf_frame_pointer,
13746 callee_opts->x_flag_omit_leaf_frame_pointer,
13747 2, 1))
13748 return false;
13749
13750 /* If the callee has specific tuning overrides, respect them. */
13751 if (callee_opts->x_aarch64_override_tune_string != NULL
13752 && caller_opts->x_aarch64_override_tune_string == NULL)
13753 return false;
13754
13755 /* If the user specified tuning override strings for the
13756 caller and callee and they don't match up, reject inlining.
13757 We just do a string compare here, we don't analyze the meaning
13758 of the string, as it would be too costly for little gain. */
13759 if (callee_opts->x_aarch64_override_tune_string
13760 && caller_opts->x_aarch64_override_tune_string
13761 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13762 caller_opts->x_aarch64_override_tune_string) != 0))
13763 return false;
13764
13765 return true;
13766 }
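/* Illustrative example (not part of the original file) of the ISA-subset
   rule above, as seen from user code: the callee's features must be a
   subset of the caller's, so with something like

     __attribute__ ((target ("+crc")))
     static inline int with_crc (int x) { return x; }

     int plain (int x) { return with_crc (x); }     // not inlined: +crc missing

     __attribute__ ((target ("arch=armv8-a+crc")))
     int also_crc (int x) { return with_crc (x); }  // may be inlined

   inlining into plain () is rejected unless the whole translation unit
   already enables CRC.  */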
13767
13768 /* Return true if SYMBOL_REF X binds locally. */
13769
13770 static bool
13771 aarch64_symbol_binds_local_p (const_rtx x)
13772 {
13773 return (SYMBOL_REF_DECL (x)
13774 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13775 : SYMBOL_REF_LOCAL_P (x));
13776 }
13777
13778 /* Return true if SYMBOL_REF X is thread-local. */
13779 static bool
13780 aarch64_tls_symbol_p (rtx x)
13781 {
13782 if (! TARGET_HAVE_TLS)
13783 return false;
13784
13785 if (GET_CODE (x) != SYMBOL_REF)
13786 return false;
13787
13788 return SYMBOL_REF_TLS_MODEL (x) != 0;
13789 }
13790
13791 /* Classify a TLS symbol into one of the TLS kinds. */
13792 enum aarch64_symbol_type
13793 aarch64_classify_tls_symbol (rtx x)
13794 {
13795 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13796
13797 switch (tls_kind)
13798 {
13799 case TLS_MODEL_GLOBAL_DYNAMIC:
13800 case TLS_MODEL_LOCAL_DYNAMIC:
13801 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13802
13803 case TLS_MODEL_INITIAL_EXEC:
13804 switch (aarch64_cmodel)
13805 {
13806 case AARCH64_CMODEL_TINY:
13807 case AARCH64_CMODEL_TINY_PIC:
13808 return SYMBOL_TINY_TLSIE;
13809 default:
13810 return SYMBOL_SMALL_TLSIE;
13811 }
13812
13813 case TLS_MODEL_LOCAL_EXEC:
13814 if (aarch64_tls_size == 12)
13815 return SYMBOL_TLSLE12;
13816 else if (aarch64_tls_size == 24)
13817 return SYMBOL_TLSLE24;
13818 else if (aarch64_tls_size == 32)
13819 return SYMBOL_TLSLE32;
13820 else if (aarch64_tls_size == 48)
13821 return SYMBOL_TLSLE48;
13822 else
13823 gcc_unreachable ();
13824
13825 case TLS_MODEL_EMULATED:
13826 case TLS_MODEL_NONE:
13827 return SYMBOL_FORCE_TO_MEM;
13828
13829 default:
13830 gcc_unreachable ();
13831 }
13832 }
13833
13834 /* Return the correct method for accessing X + OFFSET, where X is either
13835 a SYMBOL_REF or LABEL_REF. */
13836
13837 enum aarch64_symbol_type
13838 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13839 {
13840 if (GET_CODE (x) == LABEL_REF)
13841 {
13842 switch (aarch64_cmodel)
13843 {
13844 case AARCH64_CMODEL_LARGE:
13845 return SYMBOL_FORCE_TO_MEM;
13846
13847 case AARCH64_CMODEL_TINY_PIC:
13848 case AARCH64_CMODEL_TINY:
13849 return SYMBOL_TINY_ABSOLUTE;
13850
13851 case AARCH64_CMODEL_SMALL_SPIC:
13852 case AARCH64_CMODEL_SMALL_PIC:
13853 case AARCH64_CMODEL_SMALL:
13854 return SYMBOL_SMALL_ABSOLUTE;
13855
13856 default:
13857 gcc_unreachable ();
13858 }
13859 }
13860
13861 if (GET_CODE (x) == SYMBOL_REF)
13862 {
13863 if (aarch64_tls_symbol_p (x))
13864 return aarch64_classify_tls_symbol (x);
13865
13866 switch (aarch64_cmodel)
13867 {
13868 case AARCH64_CMODEL_TINY:
13869 /* When we retrieve symbol + offset address, we have to make sure
13870 the offset does not cause overflow of the final address. But
13871 we have no way of knowing the address of symbol at compile time
13872 so we can't accurately say if the distance between the PC and
13873 symbol + offset is outside the addressable range of +/-1M in the
13874 TINY code model. So we rely on images not being greater than
13875 1M and cap the offset at 1M and anything beyond 1M will have to
13876 be loaded using an alternative mechanism. Furthermore if the
13877 symbol is a weak reference to something that isn't known to
13878 resolve to a symbol in this module, then force to memory. */
13879 if ((SYMBOL_REF_WEAK (x)
13880 && !aarch64_symbol_binds_local_p (x))
13881 || !IN_RANGE (offset, -1048575, 1048575))
13882 return SYMBOL_FORCE_TO_MEM;
13883 return SYMBOL_TINY_ABSOLUTE;
13884
13885 case AARCH64_CMODEL_SMALL:
13886 /* Same reasoning as the tiny code model, but the offset cap here is
13887 4G. */
13888 if ((SYMBOL_REF_WEAK (x)
13889 && !aarch64_symbol_binds_local_p (x))
13890 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13891 HOST_WIDE_INT_C (4294967264)))
13892 return SYMBOL_FORCE_TO_MEM;
13893 return SYMBOL_SMALL_ABSOLUTE;
13894
13895 case AARCH64_CMODEL_TINY_PIC:
13896 if (!aarch64_symbol_binds_local_p (x))
13897 return SYMBOL_TINY_GOT;
13898 return SYMBOL_TINY_ABSOLUTE;
13899
13900 case AARCH64_CMODEL_SMALL_SPIC:
13901 case AARCH64_CMODEL_SMALL_PIC:
13902 if (!aarch64_symbol_binds_local_p (x))
13903 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13904 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13905 return SYMBOL_SMALL_ABSOLUTE;
13906
13907 case AARCH64_CMODEL_LARGE:
13908 /* This is alright even in PIC code as the constant
13909 pool reference is always PC relative and within
13910 the same translation unit. */
13911 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13912 return SYMBOL_SMALL_ABSOLUTE;
13913 else
13914 return SYMBOL_FORCE_TO_MEM;
13915
13916 default:
13917 gcc_unreachable ();
13918 }
13919 }
13920
13921 /* By default push everything into the constant pool. */
13922 return SYMBOL_FORCE_TO_MEM;
13923 }
13924
13925 bool
13926 aarch64_constant_address_p (rtx x)
13927 {
13928 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13929 }
13930
13931 bool
13932 aarch64_legitimate_pic_operand_p (rtx x)
13933 {
13934 if (GET_CODE (x) == SYMBOL_REF
13935 || (GET_CODE (x) == CONST
13936 && GET_CODE (XEXP (x, 0)) == PLUS
13937 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13938 return false;
13939
13940 return true;
13941 }
13942
13943 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13944 that should be rematerialized rather than spilled. */
13945
13946 static bool
13947 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13948 {
13949 /* Support CSE and rematerialization of common constants. */
13950 if (CONST_INT_P (x)
13951 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13952 || GET_CODE (x) == CONST_VECTOR)
13953 return true;
13954
13955 /* Do not allow vector struct mode constants for Advanced SIMD.
13956 We could support 0 and -1 easily, but they need support in
13957 aarch64-simd.md. */
13958 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13959 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13960 return false;
13961
13962 /* Only accept variable-length vector constants if they can be
13963 handled directly.
13964
13965 ??? It would be possible to handle rematerialization of other
13966 constants via secondary reloads. */
13967 if (vec_flags & VEC_ANY_SVE)
13968 return aarch64_simd_valid_immediate (x, NULL);
13969
13970 if (GET_CODE (x) == HIGH)
13971 x = XEXP (x, 0);
13972
13973 /* Accept polynomial constants that can be calculated by using the
13974 destination of a move as the sole temporary. Constants that
13975 require a second temporary cannot be rematerialized (they can't be
13976 forced to memory and also aren't legitimate constants). */
13977 poly_int64 offset;
13978 if (poly_int_rtx_p (x, &offset))
13979 return aarch64_offset_temporaries (false, offset) <= 1;
13980
13981 /* If an offset is being added to something else, we need to allow the
13982 base to be moved into the destination register, meaning that there
13983 are no free temporaries for the offset. */
13984 x = strip_offset (x, &offset);
13985 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13986 return false;
13987
13988 /* Do not allow const (plus (anchor_symbol, const_int)). */
13989 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13990 return false;
13991
13992 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13993 so spilling them is better than rematerialization. */
13994 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13995 return true;
13996
13997 /* Label references are always constant. */
13998 if (GET_CODE (x) == LABEL_REF)
13999 return true;
14000
14001 return false;
14002 }
14003
14004 rtx
14005 aarch64_load_tp (rtx target)
14006 {
14007 if (!target
14008 || GET_MODE (target) != Pmode
14009 || !register_operand (target, Pmode))
14010 target = gen_reg_rtx (Pmode);
14011
14012 /* Can return in any reg. */
14013 emit_insn (gen_aarch64_load_tp_hard (target));
14014 return target;
14015 }
14016
14017 /* On AAPCS systems, this is the "struct __va_list". */
14018 static GTY(()) tree va_list_type;
14019
14020 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14021 Return the type to use as __builtin_va_list.
14022
14023 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14024
14025 struct __va_list
14026 {
14027 void *__stack;
14028 void *__gr_top;
14029 void *__vr_top;
14030 int __gr_offs;
14031 int __vr_offs;
14032 }; */
14033
14034 static tree
14035 aarch64_build_builtin_va_list (void)
14036 {
14037 tree va_list_name;
14038 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14039
14040 /* Create the type. */
14041 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14042 /* Give it the required name. */
14043 va_list_name = build_decl (BUILTINS_LOCATION,
14044 TYPE_DECL,
14045 get_identifier ("__va_list"),
14046 va_list_type);
14047 DECL_ARTIFICIAL (va_list_name) = 1;
14048 TYPE_NAME (va_list_type) = va_list_name;
14049 TYPE_STUB_DECL (va_list_type) = va_list_name;
14050
14051 /* Create the fields. */
14052 f_stack = build_decl (BUILTINS_LOCATION,
14053 FIELD_DECL, get_identifier ("__stack"),
14054 ptr_type_node);
14055 f_grtop = build_decl (BUILTINS_LOCATION,
14056 FIELD_DECL, get_identifier ("__gr_top"),
14057 ptr_type_node);
14058 f_vrtop = build_decl (BUILTINS_LOCATION,
14059 FIELD_DECL, get_identifier ("__vr_top"),
14060 ptr_type_node);
14061 f_groff = build_decl (BUILTINS_LOCATION,
14062 FIELD_DECL, get_identifier ("__gr_offs"),
14063 integer_type_node);
14064 f_vroff = build_decl (BUILTINS_LOCATION,
14065 FIELD_DECL, get_identifier ("__vr_offs"),
14066 integer_type_node);
14067
14068 /* Tell tree-stdarg pass about our internal offset fields.
14069 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14070 purposes, to identify whether the code is updating the va_list internal
14071 offset fields in an irregular way. */
14072 va_list_gpr_counter_field = f_groff;
14073 va_list_fpr_counter_field = f_vroff;
14074
14075 DECL_ARTIFICIAL (f_stack) = 1;
14076 DECL_ARTIFICIAL (f_grtop) = 1;
14077 DECL_ARTIFICIAL (f_vrtop) = 1;
14078 DECL_ARTIFICIAL (f_groff) = 1;
14079 DECL_ARTIFICIAL (f_vroff) = 1;
14080
14081 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14082 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14083 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14084 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14085 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14086
14087 TYPE_FIELDS (va_list_type) = f_stack;
14088 DECL_CHAIN (f_stack) = f_grtop;
14089 DECL_CHAIN (f_grtop) = f_vrtop;
14090 DECL_CHAIN (f_vrtop) = f_groff;
14091 DECL_CHAIN (f_groff) = f_vroff;
14092
14093 /* Compute its layout. */
14094 layout_type (va_list_type);
14095
14096 return va_list_type;
14097 }
14098
14099 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14100 static void
14101 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14102 {
14103 const CUMULATIVE_ARGS *cum;
14104 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14105 tree stack, grtop, vrtop, groff, vroff;
14106 tree t;
14107 int gr_save_area_size = cfun->va_list_gpr_size;
14108 int vr_save_area_size = cfun->va_list_fpr_size;
14109 int vr_offset;
14110
14111 cum = &crtl->args.info;
14112 if (cfun->va_list_gpr_size)
14113 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14114 cfun->va_list_gpr_size);
14115 if (cfun->va_list_fpr_size)
14116 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14117 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14118
14119 if (!TARGET_FLOAT)
14120 {
14121 gcc_assert (cum->aapcs_nvrn == 0);
14122 vr_save_area_size = 0;
14123 }
14124
14125 f_stack = TYPE_FIELDS (va_list_type_node);
14126 f_grtop = DECL_CHAIN (f_stack);
14127 f_vrtop = DECL_CHAIN (f_grtop);
14128 f_groff = DECL_CHAIN (f_vrtop);
14129 f_vroff = DECL_CHAIN (f_groff);
14130
14131 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14132 NULL_TREE);
14133 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14134 NULL_TREE);
14135 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14136 NULL_TREE);
14137 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14138 NULL_TREE);
14139 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14140 NULL_TREE);
14141
14142 /* Emit code to initialize STACK, which points to the next varargs stack
14143 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14144 by named arguments. STACK is 8-byte aligned. */
14145 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14146 if (cum->aapcs_stack_size > 0)
14147 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14148 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14149 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14150
14151 /* Emit code to initialize GRTOP, the top of the GR save area.
14152 virtual_incoming_args_rtx should have been 16-byte aligned. */
14153 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14154 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14155 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14156
14157 /* Emit code to initialize VRTOP, the top of the VR save area.
14158 This address is gr_save_area_bytes below GRTOP, rounded
14159 down to the next 16-byte boundary. */
14160 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14161 vr_offset = ROUND_UP (gr_save_area_size,
14162 STACK_BOUNDARY / BITS_PER_UNIT);
14163
14164 if (vr_offset)
14165 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14166 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14167 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14168
14169 /* Emit code to initialize GROFF, the offset from GRTOP of the
14170 next GPR argument. */
14171 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14172 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14173 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14174
14175 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14176 of the next VR argument. */
14177 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14178 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14179 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14180 }
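
/* In effect, the expansion above initializes the fields as follows
   (a simplified sketch, not the literal generated RTL; 16 here stands
   for STACK_BOUNDARY / BITS_PER_UNIT):

     ap.__stack   = virtual_incoming_args + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = virtual_incoming_args;
     ap.__vr_top  = virtual_incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */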
14181
14182 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14183
14184 static tree
14185 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14186 gimple_seq *post_p ATTRIBUTE_UNUSED)
14187 {
14188 tree addr;
14189 bool indirect_p;
14190 bool is_ha; /* is HFA or HVA. */
14191 bool dw_align; /* double-word align. */
14192 machine_mode ag_mode = VOIDmode;
14193 int nregs;
14194 machine_mode mode;
14195
14196 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14197 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14198 HOST_WIDE_INT size, rsize, adjust, align;
14199 tree t, u, cond1, cond2;
14200
14201 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14202 if (indirect_p)
14203 type = build_pointer_type (type);
14204
14205 mode = TYPE_MODE (type);
14206
14207 f_stack = TYPE_FIELDS (va_list_type_node);
14208 f_grtop = DECL_CHAIN (f_stack);
14209 f_vrtop = DECL_CHAIN (f_grtop);
14210 f_groff = DECL_CHAIN (f_vrtop);
14211 f_vroff = DECL_CHAIN (f_groff);
14212
14213 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14214 f_stack, NULL_TREE);
14215 size = int_size_in_bytes (type);
14216
14217 bool abi_break;
14218 align
14219 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14220
14221 dw_align = false;
14222 adjust = 0;
14223 if (aarch64_vfp_is_call_or_return_candidate (mode,
14224 type,
14225 &ag_mode,
14226 &nregs,
14227 &is_ha))
14228 {
14229 /* No frontends can create types with variable-sized modes, so we
14230 shouldn't be asked to pass or return them. */
14231 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14232
14233 /* TYPE passed in fp/simd registers. */
14234 if (!TARGET_FLOAT)
14235 aarch64_err_no_fpadvsimd (mode);
14236
14237 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14238 unshare_expr (valist), f_vrtop, NULL_TREE);
14239 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14240 unshare_expr (valist), f_vroff, NULL_TREE);
14241
14242 rsize = nregs * UNITS_PER_VREG;
14243
14244 if (is_ha)
14245 {
14246 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14247 adjust = UNITS_PER_VREG - ag_size;
14248 }
14249 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14250 && size < UNITS_PER_VREG)
14251 {
14252 adjust = UNITS_PER_VREG - size;
14253 }
14254 }
14255 else
14256 {
14257 /* TYPE passed in general registers. */
14258 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14259 unshare_expr (valist), f_grtop, NULL_TREE);
14260 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14261 unshare_expr (valist), f_groff, NULL_TREE);
14262 rsize = ROUND_UP (size, UNITS_PER_WORD);
14263 nregs = rsize / UNITS_PER_WORD;
14264
14265 if (align > 8)
14266 {
14267 if (abi_break && warn_psabi)
14268 inform (input_location, "parameter passing for argument of type "
14269 "%qT changed in GCC 9.1", type);
14270 dw_align = true;
14271 }
14272
14273 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14274 && size < UNITS_PER_WORD)
14275 {
14276 adjust = UNITS_PER_WORD - size;
14277 }
14278 }
14279
14280 /* Get a local temporary for the field value. */
14281 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14282
14283 /* Emit code to branch if off >= 0. */
14284 t = build2 (GE_EXPR, boolean_type_node, off,
14285 build_int_cst (TREE_TYPE (off), 0));
14286 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14287
14288 if (dw_align)
14289 {
14290 /* Emit: offs = (offs + 15) & -16. */
14291 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14292 build_int_cst (TREE_TYPE (off), 15));
14293 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14294 build_int_cst (TREE_TYPE (off), -16));
14295 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14296 }
14297 else
14298 roundup = NULL;
14299
14300 /* Update ap.__[g|v]r_offs */
14301 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14302 build_int_cst (TREE_TYPE (off), rsize));
14303 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14304
14305 /* String up. */
14306 if (roundup)
14307 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14308
14309 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14310 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14311 build_int_cst (TREE_TYPE (f_off), 0));
14312 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14313
14314 /* String up: make sure the assignment happens before the use. */
14315 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14316 COND_EXPR_ELSE (cond1) = t;
14317
14318 /* Prepare the trees handling the argument that is passed on the stack;
14319 the top-level node is stored in ON_STACK. */
14320 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14321 if (align > 8)
14322 {
14323 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
14324 t = fold_build_pointer_plus_hwi (arg, 15);
14325 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14326 build_int_cst (TREE_TYPE (t), -16));
14327 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14328 }
14329 else
14330 roundup = NULL;
14331 /* Advance ap.__stack */
14332 t = fold_build_pointer_plus_hwi (arg, size + 7);
14333 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14334 build_int_cst (TREE_TYPE (t), -8));
14335 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14336 /* String up roundup and advance. */
14337 if (roundup)
14338 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14339 /* String up with arg */
14340 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14341 /* Big-endianness related address adjustment. */
14342 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14343 && size < UNITS_PER_WORD)
14344 {
14345 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14346 size_int (UNITS_PER_WORD - size));
14347 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14348 }
14349
14350 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14351 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14352
14353 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14354 t = off;
14355 if (adjust)
14356 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14357 build_int_cst (TREE_TYPE (off), adjust));
14358
14359 t = fold_convert (sizetype, t);
14360 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14361
14362 if (is_ha)
14363 {
14364 /* type ha; // treat as "struct {ftype field[n];}"
14365 ... [computing offs]
14366 for (i = 0; i < nregs; ++i, offs += 16)
14367 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14368 return ha; */
14369 int i;
14370 tree tmp_ha, field_t, field_ptr_t;
14371
14372 /* Declare a local variable. */
14373 tmp_ha = create_tmp_var_raw (type, "ha");
14374 gimple_add_tmp_var (tmp_ha);
14375
14376 /* Establish the base type. */
14377 switch (ag_mode)
14378 {
14379 case E_SFmode:
14380 field_t = float_type_node;
14381 field_ptr_t = float_ptr_type_node;
14382 break;
14383 case E_DFmode:
14384 field_t = double_type_node;
14385 field_ptr_t = double_ptr_type_node;
14386 break;
14387 case E_TFmode:
14388 field_t = long_double_type_node;
14389 field_ptr_t = long_double_ptr_type_node;
14390 break;
14391 case E_HFmode:
14392 field_t = aarch64_fp16_type_node;
14393 field_ptr_t = aarch64_fp16_ptr_type_node;
14394 break;
14395 case E_V2SImode:
14396 case E_V4SImode:
14397 {
14398 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14399 field_t = build_vector_type_for_mode (innertype, ag_mode);
14400 field_ptr_t = build_pointer_type (field_t);
14401 }
14402 break;
14403 default:
14404 gcc_assert (0);
14405 }
14406
14407 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
14408 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14409 addr = t;
14410 t = fold_convert (field_ptr_t, addr);
14411 t = build2 (MODIFY_EXPR, field_t,
14412 build1 (INDIRECT_REF, field_t, tmp_ha),
14413 build1 (INDIRECT_REF, field_t, t));
14414
14415 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14416 for (i = 1; i < nregs; ++i)
14417 {
14418 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14419 u = fold_convert (field_ptr_t, addr);
14420 u = build2 (MODIFY_EXPR, field_t,
14421 build2 (MEM_REF, field_t, tmp_ha,
14422 build_int_cst (field_ptr_t,
14423 (i *
14424 int_size_in_bytes (field_t)))),
14425 build1 (INDIRECT_REF, field_t, u));
14426 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14427 }
14428
14429 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14430 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14431 }
14432
14433 COND_EXPR_ELSE (cond2) = t;
14434 addr = fold_convert (build_pointer_type (type), cond1);
14435 addr = build_va_arg_indirect_ref (addr);
14436
14437 if (indirect_p)
14438 addr = build_va_arg_indirect_ref (addr);
14439
14440 return addr;
14441 }
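
/* For an argument passed in general registers, the gimplified sequence
   built above behaves roughly like the following pseudo C (a sketch;
   the FP/SIMD case uses __vr_top/__vr_offs and UNITS_PER_VREG instead,
   and the over-alignment and big-endian padding adjustments are omitted):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;		// rsize = size rounded up to 8
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;			// in the register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (void *) (((intptr_t) addr + size + 7) & -8);
   done:
     return addr;  */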
14442
14443 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14444
14445 static void
14446 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14447 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14448 int no_rtl)
14449 {
14450 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14451 CUMULATIVE_ARGS local_cum;
14452 int gr_saved = cfun->va_list_gpr_size;
14453 int vr_saved = cfun->va_list_fpr_size;
14454
14455 /* The caller has advanced CUM up to, but not beyond, the last named
14456 argument. Advance a local copy of CUM past the last "real" named
14457 argument, to find out how many registers are left over. */
14458 local_cum = *cum;
14459 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14460
14461 /* Find out how many registers we need to save.
14462 Honor the tree-stdarg analysis results. */
14463 if (cfun->va_list_gpr_size)
14464 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14465 cfun->va_list_gpr_size / UNITS_PER_WORD);
14466 if (cfun->va_list_fpr_size)
14467 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14468 cfun->va_list_fpr_size / UNITS_PER_VREG);
14469
14470 if (!TARGET_FLOAT)
14471 {
14472 gcc_assert (local_cum.aapcs_nvrn == 0);
14473 vr_saved = 0;
14474 }
14475
14476 if (!no_rtl)
14477 {
14478 if (gr_saved > 0)
14479 {
14480 rtx ptr, mem;
14481
14482 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14483 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14484 - gr_saved * UNITS_PER_WORD);
14485 mem = gen_frame_mem (BLKmode, ptr);
14486 set_mem_alias_set (mem, get_varargs_alias_set ());
14487
14488 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14489 mem, gr_saved);
14490 }
14491 if (vr_saved > 0)
14492 {
14493 /* We can't use move_block_from_reg, because it will use
14494 the wrong mode, storing D regs only. */
14495 machine_mode mode = TImode;
14496 int off, i, vr_start;
14497
14498 /* Set OFF to the offset from virtual_incoming_args_rtx of
14499 the first vector register. The VR save area lies below
14500 the GR one, and is aligned to 16 bytes. */
14501 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14502 STACK_BOUNDARY / BITS_PER_UNIT);
14503 off -= vr_saved * UNITS_PER_VREG;
14504
14505 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14506 for (i = 0; i < vr_saved; ++i)
14507 {
14508 rtx ptr, mem;
14509
14510 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14511 mem = gen_frame_mem (mode, ptr);
14512 set_mem_alias_set (mem, get_varargs_alias_set ());
14513 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14514 off += UNITS_PER_VREG;
14515 }
14516 }
14517 }
14518
14519 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14520 any complication of having crtl->args.pretend_args_size changed. */
14521 cfun->machine->frame.saved_varargs_size
14522 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14523 STACK_BOUNDARY / BITS_PER_UNIT)
14524 + vr_saved * UNITS_PER_VREG);
14525 }
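
/* The save area laid out above sits just below the incoming argument
   area (a sketch; addresses decrease downwards, and gr_saved/vr_saved
   are the counts computed above):

	+--------------------+ <- virtual_incoming_args_rtx (ap.__gr_top)
	|    GR save area    |    gr_saved * UNITS_PER_WORD bytes
	+--------------------+ <- 16-byte aligned boundary (ap.__vr_top)
	|    VR save area    |    vr_saved * UNITS_PER_VREG bytes
	+--------------------+  */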
14526
14527 static void
14528 aarch64_conditional_register_usage (void)
14529 {
14530 int i;
14531 if (!TARGET_FLOAT)
14532 {
14533 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14534 {
14535 fixed_regs[i] = 1;
14536 call_used_regs[i] = 1;
14537 }
14538 }
14539 if (!TARGET_SVE)
14540 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14541 {
14542 fixed_regs[i] = 1;
14543 call_used_regs[i] = 1;
14544 }
14545
14546 /* When tracking speculation, we need a couple of call-clobbered registers
14547 to track the speculation state. It would be nice to just use
14548 IP0 and IP1, but currently there are numerous places that just
14549 assume these registers are free for other uses (e.g. pointer
14550 authentication). */
14551 if (aarch64_track_speculation)
14552 {
14553 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14554 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14555 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14556 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14557 }
14558 }
14559
14560 /* Walk down the type tree of TYPE counting consecutive base elements.
14561 If *MODEP is VOIDmode, then set it to the first valid floating point
14562 type. If a non-floating point type is found, or if a floating point
14563 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14564 otherwise return the count in the sub-tree. */
14565 static int
14566 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14567 {
14568 machine_mode mode;
14569 HOST_WIDE_INT size;
14570
14571 switch (TREE_CODE (type))
14572 {
14573 case REAL_TYPE:
14574 mode = TYPE_MODE (type);
14575 if (mode != DFmode && mode != SFmode
14576 && mode != TFmode && mode != HFmode)
14577 return -1;
14578
14579 if (*modep == VOIDmode)
14580 *modep = mode;
14581
14582 if (*modep == mode)
14583 return 1;
14584
14585 break;
14586
14587 case COMPLEX_TYPE:
14588 mode = TYPE_MODE (TREE_TYPE (type));
14589 if (mode != DFmode && mode != SFmode
14590 && mode != TFmode && mode != HFmode)
14591 return -1;
14592
14593 if (*modep == VOIDmode)
14594 *modep = mode;
14595
14596 if (*modep == mode)
14597 return 2;
14598
14599 break;
14600
14601 case VECTOR_TYPE:
14602 /* Use V2SImode and V4SImode as representatives of all 64-bit
14603 and 128-bit vector types. */
14604 size = int_size_in_bytes (type);
14605 switch (size)
14606 {
14607 case 8:
14608 mode = V2SImode;
14609 break;
14610 case 16:
14611 mode = V4SImode;
14612 break;
14613 default:
14614 return -1;
14615 }
14616
14617 if (*modep == VOIDmode)
14618 *modep = mode;
14619
14620 /* Vector modes are considered to be opaque: two vectors are
14621 equivalent for the purposes of being homogeneous aggregates
14622 if they are the same size. */
14623 if (*modep == mode)
14624 return 1;
14625
14626 break;
14627
14628 case ARRAY_TYPE:
14629 {
14630 int count;
14631 tree index = TYPE_DOMAIN (type);
14632
14633 /* Can't handle incomplete types or sizes that are not
14634 fixed. */
14635 if (!COMPLETE_TYPE_P (type)
14636 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14637 return -1;
14638
14639 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14640 if (count == -1
14641 || !index
14642 || !TYPE_MAX_VALUE (index)
14643 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14644 || !TYPE_MIN_VALUE (index)
14645 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14646 || count < 0)
14647 return -1;
14648
14649 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14650 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14651
14652 /* There must be no padding. */
14653 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14654 count * GET_MODE_BITSIZE (*modep)))
14655 return -1;
14656
14657 return count;
14658 }
14659
14660 case RECORD_TYPE:
14661 {
14662 int count = 0;
14663 int sub_count;
14664 tree field;
14665
14666 /* Can't handle incomplete types or sizes that are not
14667 fixed. */
14668 if (!COMPLETE_TYPE_P (type)
14669 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14670 return -1;
14671
14672 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14673 {
14674 if (TREE_CODE (field) != FIELD_DECL)
14675 continue;
14676
14677 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14678 if (sub_count < 0)
14679 return -1;
14680 count += sub_count;
14681 }
14682
14683 /* There must be no padding. */
14684 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14685 count * GET_MODE_BITSIZE (*modep)))
14686 return -1;
14687
14688 return count;
14689 }
14690
14691 case UNION_TYPE:
14692 case QUAL_UNION_TYPE:
14693 {
14694 /* These aren't very interesting except in a degenerate case. */
14695 int count = 0;
14696 int sub_count;
14697 tree field;
14698
14699 /* Can't handle incomplete types or sizes that are not
14700 fixed. */
14701 if (!COMPLETE_TYPE_P (type)
14702 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14703 return -1;
14704
14705 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14706 {
14707 if (TREE_CODE (field) != FIELD_DECL)
14708 continue;
14709
14710 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14711 if (sub_count < 0)
14712 return -1;
14713 count = count > sub_count ? count : sub_count;
14714 }
14715
14716 /* There must be no padding. */
14717 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14718 count * GET_MODE_BITSIZE (*modep)))
14719 return -1;
14720
14721 return count;
14722 }
14723
14724 default:
14725 break;
14726 }
14727
14728 return -1;
14729 }
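
/* For example, the walk above gives (a sketch):

     struct { double x, y, z; }     ->  3 (DFmode elements; an HFA)
     struct { _Complex float c; }   ->  2 (SFmode elements)
     struct { float f; double d; }  -> -1 (mismatched element modes)
     struct { float f; int i; }     -> -1 (non-floating-point member)  */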
14730
14731 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14732 type as described in AAPCS64 \S 4.1.2.
14733
14734 See the comment above aarch64_composite_type_p for the notes on MODE. */
14735
14736 static bool
14737 aarch64_short_vector_p (const_tree type,
14738 machine_mode mode)
14739 {
14740 poly_int64 size = -1;
14741
14742 if (type && TREE_CODE (type) == VECTOR_TYPE)
14743 size = int_size_in_bytes (type);
14744 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14745 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14746 size = GET_MODE_SIZE (mode);
14747
14748 return known_eq (size, 8) || known_eq (size, 16);
14749 }
14750
14751 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14752 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14753 array types. The C99 floating-point complex types are also considered
14754 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14755 types, which are GCC extensions and out of the scope of AAPCS64, are
14756 treated as composite types here as well.
14757
14758 Note that MODE itself is not sufficient in determining whether a type
14759 is such a composite type or not. This is because
14760 stor-layout.c:compute_record_mode may have already changed the MODE
14761 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14762 structure with only one field may have its MODE set to the mode of the
14763 field. Also an integer mode whose size matches the size of the
14764 RECORD_TYPE type may be used to substitute the original mode
14765 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14766 solely relied on. */
14767
14768 static bool
14769 aarch64_composite_type_p (const_tree type,
14770 machine_mode mode)
14771 {
14772 if (aarch64_short_vector_p (type, mode))
14773 return false;
14774
14775 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14776 return true;
14777
14778 if (mode == BLKmode
14779 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14780 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14781 return true;
14782
14783 return false;
14784 }
14785
14786 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14787 shall be passed or returned in simd/fp register(s) (providing these
14788 parameter passing registers are available).
14789
14790 Upon successful return, *COUNT returns the number of needed registers,
14791 *BASE_MODE returns the mode of the individual register and, when IS_HA
14792 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14793 floating-point aggregate or a homogeneous short-vector aggregate. */
14794
14795 static bool
14796 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14797 const_tree type,
14798 machine_mode *base_mode,
14799 int *count,
14800 bool *is_ha)
14801 {
14802 machine_mode new_mode = VOIDmode;
14803 bool composite_p = aarch64_composite_type_p (type, mode);
14804
14805 if (is_ha != NULL) *is_ha = false;
14806
14807 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14808 || aarch64_short_vector_p (type, mode))
14809 {
14810 *count = 1;
14811 new_mode = mode;
14812 }
14813 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14814 {
14815 if (is_ha != NULL) *is_ha = true;
14816 *count = 2;
14817 new_mode = GET_MODE_INNER (mode);
14818 }
14819 else if (type && composite_p)
14820 {
14821 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14822
14823 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14824 {
14825 if (is_ha != NULL) *is_ha = true;
14826 *count = ag_count;
14827 }
14828 else
14829 return false;
14830 }
14831 else
14832 return false;
14833
14834 *base_mode = new_mode;
14835 return true;
14836 }
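
/* For example (a sketch of some common cases, with HA_MAX_NUM_FLDS == 4):

     double                  -> true,  *base_mode = DFmode, *count = 1
     _Complex float          -> true,  *base_mode = SFmode, *count = 2, *is_ha
     struct { float v[4]; }  -> true,  *base_mode = SFmode, *count = 4, *is_ha
     struct { float v[5]; }  -> false (too many elements for an HFA)  */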
14837
14838 /* Implement TARGET_STRUCT_VALUE_RTX. */
14839
14840 static rtx
14841 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14842 int incoming ATTRIBUTE_UNUSED)
14843 {
14844 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14845 }
14846
14847 /* Implements target hook vector_mode_supported_p. */
14848 static bool
14849 aarch64_vector_mode_supported_p (machine_mode mode)
14850 {
14851 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14852 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14853 }
14854
14855 /* Return the full-width SVE vector mode for element mode MODE, if one
14856 exists. */
14857 opt_machine_mode
14858 aarch64_full_sve_mode (scalar_mode mode)
14859 {
14860 switch (mode)
14861 {
14862 case E_DFmode:
14863 return VNx2DFmode;
14864 case E_SFmode:
14865 return VNx4SFmode;
14866 case E_HFmode:
14867 return VNx8HFmode;
14868 case E_DImode:
14869 return VNx2DImode;
14870 case E_SImode:
14871 return VNx4SImode;
14872 case E_HImode:
14873 return VNx8HImode;
14874 case E_QImode:
14875 return VNx16QImode;
14876 default:
14877 return opt_machine_mode ();
14878 }
14879 }
14880
14881 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14882 if it exists. */
14883 opt_machine_mode
14884 aarch64_vq_mode (scalar_mode mode)
14885 {
14886 switch (mode)
14887 {
14888 case E_DFmode:
14889 return V2DFmode;
14890 case E_SFmode:
14891 return V4SFmode;
14892 case E_HFmode:
14893 return V8HFmode;
14894 case E_SImode:
14895 return V4SImode;
14896 case E_HImode:
14897 return V8HImode;
14898 case E_QImode:
14899 return V16QImode;
14900 case E_DImode:
14901 return V2DImode;
14902 default:
14903 return opt_machine_mode ();
14904 }
14905 }
14906
14907 /* Return appropriate SIMD container
14908 for MODE within a vector of WIDTH bits. */
14909 static machine_mode
14910 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14911 {
14912 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14913 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14914
14915 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14916 if (TARGET_SIMD)
14917 {
14918 if (known_eq (width, 128))
14919 return aarch64_vq_mode (mode).else_mode (word_mode);
14920 else
14921 switch (mode)
14922 {
14923 case E_SFmode:
14924 return V2SFmode;
14925 case E_HFmode:
14926 return V4HFmode;
14927 case E_SImode:
14928 return V2SImode;
14929 case E_HImode:
14930 return V4HImode;
14931 case E_QImode:
14932 return V8QImode;
14933 default:
14934 break;
14935 }
14936 }
14937 return word_mode;
14938 }
14939
14940 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14941 static machine_mode
14942 aarch64_preferred_simd_mode (scalar_mode mode)
14943 {
14944 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14945 return aarch64_simd_container_mode (mode, bits);
14946 }
14947
14948 /* Return a list of possible vector sizes for the vectorizer
14949 to iterate over. */
14950 static void
14951 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14952 {
14953 if (TARGET_SVE)
14954 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14955 sizes->safe_push (16);
14956 sizes->safe_push (8);
14957 }
14958
14959 /* Implement TARGET_MANGLE_TYPE. */
14960
14961 static const char *
14962 aarch64_mangle_type (const_tree type)
14963 {
14964 /* The AArch64 ABI documents say that "__va_list" has to be
14965 mangled as if it is in the "std" namespace. */
14966 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14967 return "St9__va_list";
14968
14969 /* Half-precision float. */
14970 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14971 return "Dh";
14972
14973 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14974 builtin types. */
14975 if (TYPE_NAME (type) != NULL)
14976 return aarch64_mangle_builtin_type (type);
14977
14978 /* Use the default mangling. */
14979 return NULL;
14980 }
14981
14982 /* Find the first rtx_insn before insn that will generate an assembly
14983 instruction. */
14984
14985 static rtx_insn *
14986 aarch64_prev_real_insn (rtx_insn *insn)
14987 {
14988 if (!insn)
14989 return NULL;
14990
14991 do
14992 {
14993 insn = prev_real_insn (insn);
14994 }
14995 while (insn && recog_memoized (insn) < 0);
14996
14997 return insn;
14998 }
14999
15000 static bool
15001 is_madd_op (enum attr_type t1)
15002 {
15003 unsigned int i;
15004 /* A number of these may be AArch32 only. */
15005 enum attr_type mlatypes[] = {
15006 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15007 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15008 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15009 };
15010
15011 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15012 {
15013 if (t1 == mlatypes[i])
15014 return true;
15015 }
15016
15017 return false;
15018 }
15019
15020 /* Check if there is a register dependency between a load and the insn
15021 for which we hold recog_data. */
15022
15023 static bool
15024 dep_between_memop_and_curr (rtx memop)
15025 {
15026 rtx load_reg;
15027 int opno;
15028
15029 gcc_assert (GET_CODE (memop) == SET);
15030
15031 if (!REG_P (SET_DEST (memop)))
15032 return false;
15033
15034 load_reg = SET_DEST (memop);
15035 for (opno = 1; opno < recog_data.n_operands; opno++)
15036 {
15037 rtx operand = recog_data.operand[opno];
15038 if (REG_P (operand)
15039 && reg_overlap_mentioned_p (load_reg, operand))
15040 return true;
15041
15042 }
15043 return false;
15044 }
15045
15046
15047 /* When working around the Cortex-A53 erratum 835769,
15048 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15049 instruction and has a preceding memory instruction such that a NOP
15050 should be inserted between them. */
15051
15052 bool
15053 aarch64_madd_needs_nop (rtx_insn* insn)
15054 {
15055 enum attr_type attr_type;
15056 rtx_insn *prev;
15057 rtx body;
15058
15059 if (!TARGET_FIX_ERR_A53_835769)
15060 return false;
15061
15062 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15063 return false;
15064
15065 attr_type = get_attr_type (insn);
15066 if (!is_madd_op (attr_type))
15067 return false;
15068
15069 prev = aarch64_prev_real_insn (insn);
15070 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15071 Restore recog state to INSN to avoid state corruption. */
15072 extract_constrain_insn_cached (insn);
15073
15074 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15075 return false;
15076
15077 body = single_set (prev);
15078
15079 /* If the previous insn is a memory op and there is no dependency between
15080 it and the DImode madd, emit a NOP between them. If body is NULL then we
15081 have a complex memory operation, probably a load/store pair.
15082 Be conservative for now and emit a NOP. */
15083 if (GET_MODE (recog_data.operand[0]) == DImode
15084 && (!body || !dep_between_memop_and_curr (body)))
15085 return true;
15086
15087 return false;
15088
15089 }
15090
15091
15092 /* Implement FINAL_PRESCAN_INSN. */
15093
15094 void
15095 aarch64_final_prescan_insn (rtx_insn *insn)
15096 {
15097 if (aarch64_madd_needs_nop (insn))
15098 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15099 }
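
/* For example, with -mfix-cortex-a53-835769 a 64-bit multiply-accumulate
   that directly follows a memory operation, such as

	ldr	x2, [x0]
	madd	x3, x4, x5, x6

   is emitted as (a sketch)

	ldr	x2, [x0]
	nop	// between mem op and mult-accumulate
	madd	x3, x4, x5, x6  */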
15100
15101
15102 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15103 instruction. */
15104
15105 bool
15106 aarch64_sve_index_immediate_p (rtx base_or_step)
15107 {
15108 return (CONST_INT_P (base_or_step)
15109 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15110 }
15111
15112 /* Return true if X is a valid immediate for the SVE ADD and SUB
15113 instructions. Negate X first if NEGATE_P is true. */
15114
15115 bool
15116 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15117 {
15118 rtx elt;
15119
15120 if (!const_vec_duplicate_p (x, &elt)
15121 || !CONST_INT_P (elt))
15122 return false;
15123
15124 HOST_WIDE_INT val = INTVAL (elt);
15125 if (negate_p)
15126 val = -val;
15127 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15128
15129 if (val & 0xff)
15130 return IN_RANGE (val, 0, 0xff);
15131 return IN_RANGE (val, 0, 0xff00);
15132 }
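
/* For example (a sketch): a vector of 7s or of 0x7f00s is accepted
   (an unsigned 8-bit immediate, optionally shifted left by 8), whereas
   a vector of 0x7f01s is rejected because it needs both a low and a
   high byte.  */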
15133
15134 /* Return true if X is a valid immediate operand for an SVE logical
15135 instruction such as AND. */
15136
15137 bool
15138 aarch64_sve_bitmask_immediate_p (rtx x)
15139 {
15140 rtx elt;
15141
15142 return (const_vec_duplicate_p (x, &elt)
15143 && CONST_INT_P (elt)
15144 && aarch64_bitmask_imm (INTVAL (elt),
15145 GET_MODE_INNER (GET_MODE (x))));
15146 }
15147
15148 /* Return true if X is a valid immediate for the SVE DUP and CPY
15149 instructions. */
15150
15151 bool
15152 aarch64_sve_dup_immediate_p (rtx x)
15153 {
15154 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15155 if (!CONST_INT_P (x))
15156 return false;
15157
15158 HOST_WIDE_INT val = INTVAL (x);
15159 if (val & 0xff)
15160 return IN_RANGE (val, -0x80, 0x7f);
15161 return IN_RANGE (val, -0x8000, 0x7f00);
15162 }
15163
15164 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15165 SIGNED_P says whether the operand is signed rather than unsigned. */
15166
15167 bool
15168 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15169 {
15170 rtx elt;
15171
15172 return (const_vec_duplicate_p (x, &elt)
15173 && CONST_INT_P (elt)
15174 && (signed_p
15175 ? IN_RANGE (INTVAL (elt), -16, 15)
15176 : IN_RANGE (INTVAL (elt), 0, 127)));
15177 }
15178
15179 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15180 instruction. Negate X first if NEGATE_P is true. */
15181
15182 bool
15183 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15184 {
15185 rtx elt;
15186 REAL_VALUE_TYPE r;
15187
15188 if (!const_vec_duplicate_p (x, &elt)
15189 || GET_CODE (elt) != CONST_DOUBLE)
15190 return false;
15191
15192 r = *CONST_DOUBLE_REAL_VALUE (elt);
15193
15194 if (negate_p)
15195 r = real_value_negate (&r);
15196
15197 if (real_equal (&r, &dconst1))
15198 return true;
15199 if (real_equal (&r, &dconsthalf))
15200 return true;
15201 return false;
15202 }
15203
15204 /* Return true if X is a valid immediate operand for an SVE FMUL
15205 instruction. */
15206
15207 bool
15208 aarch64_sve_float_mul_immediate_p (rtx x)
15209 {
15210 rtx elt;
15211
15212 /* GCC will never generate a multiply with an immediate of 2, so there is no
15213 point testing for it (even though it is a valid constant). */
15214 return (const_vec_duplicate_p (x, &elt)
15215 && GET_CODE (elt) == CONST_DOUBLE
15216 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
15217 }
15218
15219 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15220 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15221 is nonnull, use it to describe valid immediates. */
15222 static bool
15223 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15224 simd_immediate_info *info,
15225 enum simd_immediate_check which,
15226 simd_immediate_info::insn_type insn)
15227 {
15228 /* Try a 4-byte immediate with LSL. */
15229 for (unsigned int shift = 0; shift < 32; shift += 8)
15230 if ((val32 & (0xff << shift)) == val32)
15231 {
15232 if (info)
15233 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15234 simd_immediate_info::LSL, shift);
15235 return true;
15236 }
15237
15238 /* Try a 2-byte immediate with LSL. */
15239 unsigned int imm16 = val32 & 0xffff;
15240 if (imm16 == (val32 >> 16))
15241 for (unsigned int shift = 0; shift < 16; shift += 8)
15242 if ((imm16 & (0xff << shift)) == imm16)
15243 {
15244 if (info)
15245 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15246 simd_immediate_info::LSL, shift);
15247 return true;
15248 }
15249
15250 /* Try a 4-byte immediate with MSL, except for cases that MVN
15251 can handle. */
15252 if (which == AARCH64_CHECK_MOV)
15253 for (unsigned int shift = 8; shift < 24; shift += 8)
15254 {
15255 unsigned int low = (1 << shift) - 1;
15256 if (((val32 & (0xff << shift)) | low) == val32)
15257 {
15258 if (info)
15259 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15260 simd_immediate_info::MSL, shift);
15261 return true;
15262 }
15263 }
15264
15265 return false;
15266 }
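
/* For example (a sketch):

     val32 == 0x00ab0000  ->  8-bit value 0xab, LSL #16 (SImode element)
     val32 == 0x004d004d  ->  8-bit value 0x4d, LSL #0  (HImode element)
     val32 == 0x0000abff  ->  8-bit value 0xab, MSL #8  (MOV checks only)  */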
15267
15268 /* Return true if replicating VAL64 is a valid immediate for the
15269 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15270 use it to describe valid immediates. */
15271 static bool
15272 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15273 simd_immediate_info *info,
15274 enum simd_immediate_check which)
15275 {
15276 unsigned int val32 = val64 & 0xffffffff;
15277 unsigned int val16 = val64 & 0xffff;
15278 unsigned int val8 = val64 & 0xff;
15279
15280 if (val32 == (val64 >> 32))
15281 {
15282 if ((which & AARCH64_CHECK_ORR) != 0
15283 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15284 simd_immediate_info::MOV))
15285 return true;
15286
15287 if ((which & AARCH64_CHECK_BIC) != 0
15288 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15289 simd_immediate_info::MVN))
15290 return true;
15291
15292 /* Try using a replicated byte. */
15293 if (which == AARCH64_CHECK_MOV
15294 && val16 == (val32 >> 16)
15295 && val8 == (val16 >> 8))
15296 {
15297 if (info)
15298 *info = simd_immediate_info (QImode, val8);
15299 return true;
15300 }
15301 }
15302
15303 /* Try using a bit-to-bytemask. */
15304 if (which == AARCH64_CHECK_MOV)
15305 {
15306 unsigned int i;
15307 for (i = 0; i < 64; i += 8)
15308 {
15309 unsigned char byte = (val64 >> i) & 0xff;
15310 if (byte != 0 && byte != 0xff)
15311 break;
15312 }
15313 if (i == 64)
15314 {
15315 if (info)
15316 *info = simd_immediate_info (DImode, val64);
15317 return true;
15318 }
15319 }
15320 return false;
15321 }
15322
15323 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15324 instruction. If INFO is nonnull, use it to describe valid immediates. */
15325
15326 static bool
15327 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15328 simd_immediate_info *info)
15329 {
15330 scalar_int_mode mode = DImode;
15331 unsigned int val32 = val64 & 0xffffffff;
15332 if (val32 == (val64 >> 32))
15333 {
15334 mode = SImode;
15335 unsigned int val16 = val32 & 0xffff;
15336 if (val16 == (val32 >> 16))
15337 {
15338 mode = HImode;
15339 unsigned int val8 = val16 & 0xff;
15340 if (val8 == (val16 >> 8))
15341 mode = QImode;
15342 }
15343 }
15344 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15345 if (IN_RANGE (val, -0x80, 0x7f))
15346 {
15347 /* DUP with no shift. */
15348 if (info)
15349 *info = simd_immediate_info (mode, val);
15350 return true;
15351 }
15352 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15353 {
15354 /* DUP with LSL #8. */
15355 if (info)
15356 *info = simd_immediate_info (mode, val);
15357 return true;
15358 }
15359 if (aarch64_bitmask_imm (val64, mode))
15360 {
15361 /* DUPM. */
15362 if (info)
15363 *info = simd_immediate_info (mode, val);
15364 return true;
15365 }
15366 return false;
15367 }
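
/* For example (a sketch):

     val64 == 0x2a2a2a2a2a2a2a2a  ->  DUP  .b, #42
     val64 == 0x1200120012001200  ->  DUP  .h, #0x12, LSL #8
     val64 == 0x00ff00ff00ff00ff  ->  DUPM (a repeating bitmask immediate)  */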
15368
15369 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15370 it to describe valid immediates. */
15371
15372 static bool
15373 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15374 {
15375 if (x == CONST0_RTX (GET_MODE (x)))
15376 {
15377 if (info)
15378 *info = simd_immediate_info (DImode, 0);
15379 return true;
15380 }
15381
15382 /* Analyze the value as a VNx16BImode. This should be relatively
15383 efficient, since rtx_vector_builder has enough built-in capacity
15384 to store all VLA predicate constants without needing the heap. */
15385 rtx_vector_builder builder;
15386 if (!aarch64_get_sve_pred_bits (builder, x))
15387 return false;
15388
15389 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15390 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15391 {
15392 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15393 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15394 if (pattern != AARCH64_NUM_SVPATTERNS)
15395 {
15396 if (info)
15397 {
15398 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15399 *info = simd_immediate_info (int_mode, pattern);
15400 }
15401 return true;
15402 }
15403 }
15404 return false;
15405 }
15406
15407 /* Return true if OP is a valid SIMD immediate for the operation
15408 described by WHICH. If INFO is nonnull, use it to describe valid
15409 immediates. */
15410 bool
15411 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15412 enum simd_immediate_check which)
15413 {
15414 machine_mode mode = GET_MODE (op);
15415 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15416 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15417 return false;
15418
15419 if (vec_flags & VEC_SVE_PRED)
15420 return aarch64_sve_pred_valid_immediate (op, info);
15421
15422 scalar_mode elt_mode = GET_MODE_INNER (mode);
15423 rtx base, step;
15424 unsigned int n_elts;
15425 if (GET_CODE (op) == CONST_VECTOR
15426 && CONST_VECTOR_DUPLICATE_P (op))
15427 n_elts = CONST_VECTOR_NPATTERNS (op);
15428 else if ((vec_flags & VEC_SVE_DATA)
15429 && const_vec_series_p (op, &base, &step))
15430 {
15431 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15432 if (!aarch64_sve_index_immediate_p (base)
15433 || !aarch64_sve_index_immediate_p (step))
15434 return false;
15435
15436 if (info)
15437 *info = simd_immediate_info (elt_mode, base, step);
15438 return true;
15439 }
15440 else if (GET_CODE (op) == CONST_VECTOR
15441 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15442 /* N_ELTS set above. */;
15443 else
15444 return false;
15445
15446 scalar_float_mode elt_float_mode;
15447 if (n_elts == 1
15448 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15449 {
15450 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15451 if (aarch64_float_const_zero_rtx_p (elt)
15452 || aarch64_float_const_representable_p (elt))
15453 {
15454 if (info)
15455 *info = simd_immediate_info (elt_float_mode, elt);
15456 return true;
15457 }
15458 }
15459
15460 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15461 if (elt_size > 8)
15462 return false;
15463
15464 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15465
15466 /* Expand the vector constant out into a byte vector, with the least
15467 significant byte of the register first. */
15468 auto_vec<unsigned char, 16> bytes;
15469 bytes.reserve (n_elts * elt_size);
15470 for (unsigned int i = 0; i < n_elts; i++)
15471 {
15472 /* The vector is provided in gcc endian-neutral fashion.
15473 For aarch64_be Advanced SIMD, it must be laid out in the vector
15474 register in reverse order. */
15475 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15476 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15477
15478 if (elt_mode != elt_int_mode)
15479 elt = gen_lowpart (elt_int_mode, elt);
15480
15481 if (!CONST_INT_P (elt))
15482 return false;
15483
15484 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15485 for (unsigned int byte = 0; byte < elt_size; byte++)
15486 {
15487 bytes.quick_push (elt_val & 0xff);
15488 elt_val >>= BITS_PER_UNIT;
15489 }
15490 }
15491
15492 /* The immediate must repeat every eight bytes. */
15493 unsigned int nbytes = bytes.length ();
15494 for (unsigned i = 8; i < nbytes; ++i)
15495 if (bytes[i] != bytes[i - 8])
15496 return false;
15497
15498 /* Get the repeating 8-byte value as an integer. No endian correction
15499 is needed here because bytes is already in lsb-first order. */
15500 unsigned HOST_WIDE_INT val64 = 0;
15501 for (unsigned int i = 0; i < 8; i++)
15502 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15503 << (i * BITS_PER_UNIT));
15504
15505 if (vec_flags & VEC_SVE_DATA)
15506 return aarch64_sve_valid_immediate (val64, info);
15507 else
15508 return aarch64_advsimd_valid_immediate (val64, info, which);
15509 }
15510
15511 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15512 has a step in the range of the SVE INDEX immediate. Return the index
15513 expression if so, otherwise return null. */
15514 rtx
15515 aarch64_check_zero_based_sve_index_immediate (rtx x)
15516 {
15517 rtx base, step;
15518 if (const_vec_series_p (x, &base, &step)
15519 && base == const0_rtx
15520 && aarch64_sve_index_immediate_p (step))
15521 return step;
15522 return NULL_RTX;
15523 }
15524
15525 /* Check if immediate shift constants are within range. */
15526 bool
15527 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15528 {
15529 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15530 if (left)
15531 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15532 else
15533 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15534 }
15535
15536 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15537 operation of width WIDTH at bit position POS. */
15538
15539 rtx
15540 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15541 {
15542 gcc_assert (CONST_INT_P (width));
15543 gcc_assert (CONST_INT_P (pos));
15544
15545 unsigned HOST_WIDE_INT mask
15546 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15547 return GEN_INT (mask << UINTVAL (pos));
15548 }
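
/* For example, WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0x00ff0000.  */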
15549
15550 bool
15551 aarch64_mov_operand_p (rtx x, machine_mode mode)
15552 {
15553 if (GET_CODE (x) == HIGH
15554 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15555 return true;
15556
15557 if (CONST_INT_P (x))
15558 return true;
15559
15560 if (VECTOR_MODE_P (GET_MODE (x)))
15561 {
15562 /* Require predicate constants to be VNx16BI before RA, so that we
15563 force everything to have a canonical form. */
15564 if (!lra_in_progress
15565 && !reload_completed
15566 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15567 && GET_MODE (x) != VNx16BImode)
15568 return false;
15569
15570 return aarch64_simd_valid_immediate (x, NULL);
15571 }
15572
15573 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15574 return true;
15575
15576 if (aarch64_sve_cnt_immediate_p (x))
15577 return true;
15578
15579 return aarch64_classify_symbolic_expression (x)
15580 == SYMBOL_TINY_ABSOLUTE;
15581 }
15582
15583 /* Return a const_int vector of VAL. */
15584 rtx
15585 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15586 {
15587 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15588 return gen_const_vec_duplicate (mode, c);
15589 }
15590
15591 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15592
15593 bool
15594 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15595 {
15596 machine_mode vmode;
15597
15598 vmode = aarch64_simd_container_mode (mode, 64);
15599 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15600 return aarch64_simd_valid_immediate (op_v, NULL);
15601 }
15602
15603 /* Construct and return a PARALLEL RTX vector with elements numbering the
15604 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15605 the vector - from the perspective of the architecture. This does not
15606 line up with GCC's perspective on lane numbers, so we end up with
15607 different masks depending on our target endian-ness. The diagram
15608 below may help. We must draw the distinction when building masks
15609 which select one half of the vector. An instruction selecting
15610 architectural low-lanes for a big-endian target must be described using
15611 a mask selecting GCC high-lanes.
15612
15613 Big-Endian Little-Endian
15614
15615 GCC 0 1 2 3 3 2 1 0
15616 | x | x | x | x | | x | x | x | x |
15617 Architecture 3 2 1 0 3 2 1 0
15618
15619 Low Mask: { 2, 3 } { 0, 1 }
15620 High Mask: { 0, 1 } { 2, 3 }
15621
15622 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15623
15624 rtx
15625 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15626 {
15627 rtvec v = rtvec_alloc (nunits / 2);
15628 int high_base = nunits / 2;
15629 int low_base = 0;
15630 int base;
15631 rtx t1;
15632 int i;
15633
15634 if (BYTES_BIG_ENDIAN)
15635 base = high ? low_base : high_base;
15636 else
15637 base = high ? high_base : low_base;
15638
15639 for (i = 0; i < nunits / 2; i++)
15640 RTVEC_ELT (v, i) = GEN_INT (base + i);
15641
15642 t1 = gen_rtx_PARALLEL (mode, v);
15643 return t1;
15644 }
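
/* For example, for a V4SImode vector (NUNITS == 4) and HIGH == true this
   returns a PARALLEL selecting GCC lanes { 2, 3 } on little-endian and
   { 0, 1 } on big-endian, matching the "High Mask" row of the diagram
   above.  */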
15645
15646 /* Check OP for validity as a PARALLEL RTX vector with elements
15647 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15648 from the perspective of the architecture. See the diagram above
15649 aarch64_simd_vect_par_cnst_half for more details. */
15650
15651 bool
15652 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15653 bool high)
15654 {
15655 int nelts;
15656 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15657 return false;
15658
15659 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15660 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15661 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15662 int i = 0;
15663
15664 if (count_op != count_ideal)
15665 return false;
15666
15667 for (i = 0; i < count_ideal; i++)
15668 {
15669 rtx elt_op = XVECEXP (op, 0, i);
15670 rtx elt_ideal = XVECEXP (ideal, 0, i);
15671
15672 if (!CONST_INT_P (elt_op)
15673 || INTVAL (elt_ideal) != INTVAL (elt_op))
15674 return false;
15675 }
15676 return true;
15677 }
15678
15679 /* Return a PARALLEL containing NELTS elements, with element I equal
15680 to BASE + I * STEP. */
15681
15682 rtx
15683 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15684 {
15685 rtvec vec = rtvec_alloc (nelts);
15686 for (unsigned int i = 0; i < nelts; ++i)
15687 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15688 return gen_rtx_PARALLEL (VOIDmode, vec);
15689 }
15690
15691 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15692 series with step STEP. */
15693
15694 bool
15695 aarch64_stepped_int_parallel_p (rtx op, int step)
15696 {
15697 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15698 return false;
15699
15700 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15701 for (int i = 1; i < XVECLEN (op, 0); ++i)
15702 if (!CONST_INT_P (XVECEXP (op, 0, i))
15703 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15704 return false;
15705
15706 return true;
15707 }
15708
15709 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15710 HIGH (exclusive). */
15711 void
15712 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15713 const_tree exp)
15714 {
15715 HOST_WIDE_INT lane;
15716 gcc_assert (CONST_INT_P (operand));
15717 lane = INTVAL (operand);
15718
15719 if (lane < low || lane >= high)
15720 {
15721 if (exp)
15722 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15723 else
15724 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15725 }
15726 }
15727
15728 /* Perform endian correction on lane number N, which indexes a vector
15729 of mode MODE, and return the result as an SImode rtx. */
15730
15731 rtx
15732 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15733 {
15734 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15735 }
15736
15737 /* Return TRUE if OP is a valid vector addressing mode. */
15738
15739 bool
15740 aarch64_simd_mem_operand_p (rtx op)
15741 {
15742 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15743 || REG_P (XEXP (op, 0)));
15744 }
15745
15746 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15747
15748 bool
15749 aarch64_sve_ld1r_operand_p (rtx op)
15750 {
15751 struct aarch64_address_info addr;
15752 scalar_mode mode;
15753
15754 return (MEM_P (op)
15755 && is_a <scalar_mode> (GET_MODE (op), &mode)
15756 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15757 && addr.type == ADDRESS_REG_IMM
15758 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15759 }
15760
15761 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15762 bool
15763 aarch64_sve_ld1rq_operand_p (rtx op)
15764 {
15765 struct aarch64_address_info addr;
15766 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15767 if (!MEM_P (op)
15768 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15769 return false;
15770
15771 if (addr.type == ADDRESS_REG_IMM)
15772 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15773
15774 if (addr.type == ADDRESS_REG_REG)
15775 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15776
15777 return false;
15778 }
15779
15780 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15781 The conditions for STR are the same. */
15782 bool
15783 aarch64_sve_ldr_operand_p (rtx op)
15784 {
15785 struct aarch64_address_info addr;
15786
15787 return (MEM_P (op)
15788 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15789 false, ADDR_QUERY_ANY)
15790 && addr.type == ADDRESS_REG_IMM);
15791 }
15792
15793 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15794 We need to be able to access the individual pieces, so the range
15795 is different from LD[234] and ST[234]. */
15796 bool
15797 aarch64_sve_struct_memory_operand_p (rtx op)
15798 {
15799 if (!MEM_P (op))
15800 return false;
15801
15802 machine_mode mode = GET_MODE (op);
15803 struct aarch64_address_info addr;
15804 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15805 ADDR_QUERY_ANY)
15806 || addr.type != ADDRESS_REG_IMM)
15807 return false;
15808
15809 poly_int64 first = addr.const_offset;
15810 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15811 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15812 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15813 }
15814
15815 /* Emit a register copy from operand to operand, taking care not to
15816 early-clobber source registers in the process.
15817
15818 COUNT is the number of components into which the copy needs to be
15819 decomposed. */
15820 void
15821 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15822 unsigned int count)
15823 {
15824 unsigned int i;
15825 int rdest = REGNO (operands[0]);
15826 int rsrc = REGNO (operands[1]);
15827
15828 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15829 || rdest < rsrc)
15830 for (i = 0; i < count; i++)
15831 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15832 gen_rtx_REG (mode, rsrc + i));
15833 else
15834 for (i = 0; i < count; i++)
15835 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15836 gen_rtx_REG (mode, rsrc + count - i - 1));
15837 }
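
/* For example, copying a two-register value from { V1, V2 } to { V2, V3 }
   (a sketch): the ranges overlap and REGNO (V2) > REGNO (V1), so the
   second loop runs and emits V3 <- V2 followed by V2 <- V1, avoiding the
   clobber of V2 that the forward order would cause.  */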
15838
15839 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15840 one of the VSTRUCT modes: OI, CI, or XI. */
15841 int
15842 aarch64_simd_attr_length_rglist (machine_mode mode)
15843 {
15844 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15845 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15846 }
15847
15848 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15849 alignment of a vector to 128 bits. SVE predicates have an alignment of
15850 16 bits. */
15851 static HOST_WIDE_INT
15852 aarch64_simd_vector_alignment (const_tree type)
15853 {
15854 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15855 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15856 be set for non-predicate vectors of booleans. Modes are the most
15857 direct way we have of identifying real SVE predicate types. */
15858 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15859 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15860 }
15861
15862 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15863 static poly_uint64
15864 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15865 {
15866 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15867 {
15868 /* If the length of the vector is fixed, try to align to that length,
15869 otherwise don't try to align at all. */
15870 HOST_WIDE_INT result;
15871 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15872 result = TYPE_ALIGN (TREE_TYPE (type));
15873 return result;
15874 }
15875 return TYPE_ALIGN (type);
15876 }
15877
15878 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15879 static bool
15880 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15881 {
15882 if (is_packed)
15883 return false;
15884
15885 /* For fixed-length vectors, check that the vectorizer will aim for
15886 full-vector alignment. This isn't true for generic GCC vectors
15887 that are wider than the ABI maximum of 128 bits. */
15888 poly_uint64 preferred_alignment =
15889 aarch64_vectorize_preferred_vector_alignment (type);
15890 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15891 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15892 preferred_alignment))
15893 return false;
15894
15895 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15896 return true;
15897 }
15898
15899 /* Return true if the vector misalignment factor is supported by the
15900 target. */
15901 static bool
15902 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15903 const_tree type, int misalignment,
15904 bool is_packed)
15905 {
15906 if (TARGET_SIMD && STRICT_ALIGNMENT)
15907 {
15908 /* Return false if the movmisalign pattern is not supported for this mode. */
15909 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15910 return false;
15911
15912 /* Misalignment factor is unknown at compile time. */
15913 if (misalignment == -1)
15914 return false;
15915 }
15916 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15917 is_packed);
15918 }
15919
15920 /* If VALS is a vector constant that can be loaded into a register
15921 using DUP, generate instructions to do so and return an RTX to
15922 assign to the register. Otherwise return NULL_RTX. */
15923 static rtx
15924 aarch64_simd_dup_constant (rtx vals)
15925 {
15926 machine_mode mode = GET_MODE (vals);
15927 machine_mode inner_mode = GET_MODE_INNER (mode);
15928 rtx x;
15929
15930 if (!const_vec_duplicate_p (vals, &x))
15931 return NULL_RTX;
15932
15933 /* We can load this constant by using DUP and a constant in a
15934 single ARM register. This will be cheaper than a vector
15935 load. */
15936 x = copy_to_mode_reg (inner_mode, x);
15937 return gen_vec_duplicate (mode, x);
15938 }
15939
15940
15941 /* Generate code to load VALS, which is a PARALLEL containing only
15942 constants (for vec_init) or CONST_VECTOR, efficiently into a
15943 register. Returns an RTX to copy into the register, or NULL_RTX
15944 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15945 static rtx
15946 aarch64_simd_make_constant (rtx vals)
15947 {
15948 machine_mode mode = GET_MODE (vals);
15949 rtx const_dup;
15950 rtx const_vec = NULL_RTX;
15951 int n_const = 0;
15952 int i;
15953
15954 if (GET_CODE (vals) == CONST_VECTOR)
15955 const_vec = vals;
15956 else if (GET_CODE (vals) == PARALLEL)
15957 {
15958 /* A CONST_VECTOR must contain only CONST_INTs and
15959 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15960 Only store valid constants in a CONST_VECTOR. */
15961 int n_elts = XVECLEN (vals, 0);
15962 for (i = 0; i < n_elts; ++i)
15963 {
15964 rtx x = XVECEXP (vals, 0, i);
15965 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15966 n_const++;
15967 }
15968 if (n_const == n_elts)
15969 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15970 }
15971 else
15972 gcc_unreachable ();
15973
15974 if (const_vec != NULL_RTX
15975 && aarch64_simd_valid_immediate (const_vec, NULL))
15976 /* Load using MOVI/MVNI. */
15977 return const_vec;
15978 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15979 /* Loaded using DUP. */
15980 return const_dup;
15981 else if (const_vec != NULL_RTX)
15982 /* Load from constant pool. We cannot take advantage of single-cycle
15983 LD1 because we need a PC-relative addressing mode. */
15984 return const_vec;
15985 else
15986 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15987 We cannot construct an initializer. */
15988 return NULL_RTX;
15989 }
15990
15991 /* Expand a vector initialisation sequence, such that TARGET is
15992 initialised to contain VALS. */
15993
15994 void
15995 aarch64_expand_vector_init (rtx target, rtx vals)
15996 {
15997 machine_mode mode = GET_MODE (target);
15998 scalar_mode inner_mode = GET_MODE_INNER (mode);
15999 /* The number of vector elements. */
16000 int n_elts = XVECLEN (vals, 0);
16001 /* The number of vector elements which are not constant. */
16002 int n_var = 0;
16003 rtx any_const = NULL_RTX;
16004 /* The first element of vals. */
16005 rtx v0 = XVECEXP (vals, 0, 0);
16006 bool all_same = true;
16007
16008 /* This is a special vec_init<M><N> where N is not an element mode but a
16009 vector mode with half the elements of M. We expect to find two entries
16010 of mode N in VALS and we must put their concatenation into TARGET. */
16011 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16012 {
16013 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16014 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16015 rtx lo = XVECEXP (vals, 0, 0);
16016 rtx hi = XVECEXP (vals, 0, 1);
16017 machine_mode narrow_mode = GET_MODE (lo);
16018 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16019 gcc_assert (narrow_mode == GET_MODE (hi));
16020
16021 /* When we want to concatenate a half-width vector with zeroes we can
16022 use the aarch64_combinez[_be] patterns. Just make sure that the
16023 zeroes are in the right half. */
16024 if (BYTES_BIG_ENDIAN
16025 && aarch64_simd_imm_zero (lo, narrow_mode)
16026 && general_operand (hi, narrow_mode))
16027 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16028 else if (!BYTES_BIG_ENDIAN
16029 && aarch64_simd_imm_zero (hi, narrow_mode)
16030 && general_operand (lo, narrow_mode))
16031 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16032 else
16033 {
16034 /* Else create the two half-width registers and combine them. */
16035 if (!REG_P (lo))
16036 lo = force_reg (GET_MODE (lo), lo);
16037 if (!REG_P (hi))
16038 hi = force_reg (GET_MODE (hi), hi);
16039
16040 if (BYTES_BIG_ENDIAN)
16041 std::swap (lo, hi);
16042 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16043 }
16044 return;
16045 }
16046
16047 /* Count the number of variable elements to initialise. */
16048 for (int i = 0; i < n_elts; ++i)
16049 {
16050 rtx x = XVECEXP (vals, 0, i);
16051 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16052 ++n_var;
16053 else
16054 any_const = x;
16055
16056 all_same &= rtx_equal_p (x, v0);
16057 }
16058
16059 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16060 how best to handle this. */
16061 if (n_var == 0)
16062 {
16063 rtx constant = aarch64_simd_make_constant (vals);
16064 if (constant != NULL_RTX)
16065 {
16066 emit_move_insn (target, constant);
16067 return;
16068 }
16069 }
16070
16071 /* Splat a single non-constant element if we can. */
16072 if (all_same)
16073 {
16074 rtx x = copy_to_mode_reg (inner_mode, v0);
16075 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16076 return;
16077 }
16078
16079 enum insn_code icode = optab_handler (vec_set_optab, mode);
16080 gcc_assert (icode != CODE_FOR_nothing);
16081
16082 /* If there are only variable elements, try to optimize
16083 the insertion using dup for the most common element
16084 followed by insertions. */
16085
16086 /* The algorithm will fill matches[*][0] with the earliest matching element,
16087 and matches[X][1] with the count of duplicate elements (if X is the
16088 earliest element which has duplicates). */
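/* As a worked example, for VALS = { x, y, x, x } the loop below produces
   matches[0] = { 0, 3 }, matches[1] = { 1, 1 } and matches[2] = matches[3]
   = { 0, 0 }, so MAXELEMENT is 0: we emit a DUP of x and then a single
   insert of y into lane 1.  */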
16089
16090 if (n_var == n_elts && n_elts <= 16)
16091 {
16092 int matches[16][2] = {0};
16093 for (int i = 0; i < n_elts; i++)
16094 {
16095 for (int j = 0; j <= i; j++)
16096 {
16097 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16098 {
16099 matches[i][0] = j;
16100 matches[j][1]++;
16101 break;
16102 }
16103 }
16104 }
16105 int maxelement = 0;
16106 int maxv = 0;
16107 for (int i = 0; i < n_elts; i++)
16108 if (matches[i][1] > maxv)
16109 {
16110 maxelement = i;
16111 maxv = matches[i][1];
16112 }
16113
16114 /* Create a duplicate of the most common element, unless all elements
16115 are equally useless to us, in which case just immediately set the
16116 vector register using the first element. */
16117
16118 if (maxv == 1)
16119 {
16120 /* For vectors of two 64-bit elements, we can do even better. */
16121 if (n_elts == 2
16122 && (inner_mode == E_DImode
16123 || inner_mode == E_DFmode))
16124
16125 {
16126 rtx x0 = XVECEXP (vals, 0, 0);
16127 rtx x1 = XVECEXP (vals, 0, 1);
16128 /* Combine can pick up this case, but handling it directly
16129 here leaves clearer RTL.
16130
16131 This is load_pair_lanes<mode>, and also gives us a clean-up
16132 for store_pair_lanes<mode>. */
16133 if (memory_operand (x0, inner_mode)
16134 && memory_operand (x1, inner_mode)
16135 && !STRICT_ALIGNMENT
16136 && rtx_equal_p (XEXP (x1, 0),
16137 plus_constant (Pmode,
16138 XEXP (x0, 0),
16139 GET_MODE_SIZE (inner_mode))))
16140 {
16141 rtx t;
16142 if (inner_mode == DFmode)
16143 t = gen_load_pair_lanesdf (target, x0, x1);
16144 else
16145 t = gen_load_pair_lanesdi (target, x0, x1);
16146 emit_insn (t);
16147 return;
16148 }
16149 }
16150 /* The subreg-move sequence below will move into lane zero of the
16151 vector register. For big-endian we want that position to hold
16152 the last element of VALS. */
16153 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16154 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16155 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16156 }
16157 else
16158 {
16159 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16160 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16161 }
16162
16163 /* Insert the rest. */
16164 for (int i = 0; i < n_elts; i++)
16165 {
16166 rtx x = XVECEXP (vals, 0, i);
16167 if (matches[i][0] == maxelement)
16168 continue;
16169 x = copy_to_mode_reg (inner_mode, x);
16170 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16171 }
16172 return;
16173 }
16174
16175 /* Initialise a vector which is part-variable. We want to first try
16176 to build those lanes which are constant in the most efficient way we
16177 can. */
16178 if (n_var != n_elts)
16179 {
16180 rtx copy = copy_rtx (vals);
16181
16182 /* Load constant part of vector. We really don't care what goes into the
16183 parts we will overwrite, but we're more likely to be able to load the
16184 constant efficiently if it has fewer, larger, repeating parts
16185 (see aarch64_simd_valid_immediate). */
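/* For instance, with VALS = { x, 1, 2, 3 } the variable lane 0 borrows the
   constant from lane 0 ^ 2, so COPY becomes { 2, 1, 2, 3 }; that constant is
   loaded first and x is inserted into lane 0 afterwards.  */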
16186 for (int i = 0; i < n_elts; i++)
16187 {
16188 rtx x = XVECEXP (vals, 0, i);
16189 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16190 continue;
16191 rtx subst = any_const;
16192 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16193 {
16194 /* Look in the copied vector, as more elements are const. */
16195 rtx test = XVECEXP (copy, 0, i ^ bit);
16196 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16197 {
16198 subst = test;
16199 break;
16200 }
16201 }
16202 XVECEXP (copy, 0, i) = subst;
16203 }
16204 aarch64_expand_vector_init (target, copy);
16205 }
16206
16207 /* Insert the variable lanes directly. */
16208 for (int i = 0; i < n_elts; i++)
16209 {
16210 rtx x = XVECEXP (vals, 0, i);
16211 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16212 continue;
16213 x = copy_to_mode_reg (inner_mode, x);
16214 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16215 }
16216 }
16217
16218 /* Emit RTL corresponding to:
16219 insr TARGET, ELEM. */
16220
16221 static void
16222 emit_insr (rtx target, rtx elem)
16223 {
16224 machine_mode mode = GET_MODE (target);
16225 scalar_mode elem_mode = GET_MODE_INNER (mode);
16226 elem = force_reg (elem_mode, elem);
16227
16228 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16229 gcc_assert (icode != CODE_FOR_nothing);
16230 emit_insn (GEN_FCN (icode) (target, target, elem));
16231 }
16232
16233 /* Subroutine of aarch64_sve_expand_vector_init for handling
16234 trailing constants.
16235 This function works as follows:
16236 (a) Create a new vector consisting of trailing constants.
16237 (b) Initialize TARGET with the constant vector using emit_move_insn.
16238 (c) Insert remaining elements in TARGET using insr.
16239 NELTS is the total number of elements in the original vector, while
16240 NELTS_REQD is the number of elements that are actually
16241 significant.
16242
16243 ??? The heuristic used is to do the above only if the number of constants
16244 is at least half the total number of elements. May need fine-tuning. */
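/* For example, { a, b, 1, 2 } has two trailing constants, which meets the
   threshold above: we move a constant vector whose first two elements are
   1 and 2 into TARGET and then emit insr of b followed by insr of a, which
   shifts the constants up into the last two lanes.  */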
16245
16246 static bool
16247 aarch64_sve_expand_vector_init_handle_trailing_constants
16248 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16249 {
16250 machine_mode mode = GET_MODE (target);
16251 scalar_mode elem_mode = GET_MODE_INNER (mode);
16252 int n_trailing_constants = 0;
16253
16254 for (int i = nelts_reqd - 1;
16255 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16256 i--)
16257 n_trailing_constants++;
16258
16259 if (n_trailing_constants >= nelts_reqd / 2)
16260 {
16261 rtx_vector_builder v (mode, 1, nelts);
16262 for (int i = 0; i < nelts; i++)
16263 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16264 rtx const_vec = v.build ();
16265 emit_move_insn (target, const_vec);
16266
16267 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16268 emit_insr (target, builder.elt (i));
16269
16270 return true;
16271 }
16272
16273 return false;
16274 }
16275
16276 /* Subroutine of aarch64_sve_expand_vector_init.
16277 Works as follows:
16278 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16279 (b) Skip trailing elements from BUILDER, which are the same as
16280 element NELTS_REQD - 1.
16281 (c) Insert earlier elements in reverse order in TARGET using insr. */
16282
16283 static void
16284 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16285 const rtx_vector_builder &builder,
16286 int nelts_reqd)
16287 {
16288 machine_mode mode = GET_MODE (target);
16289 scalar_mode elem_mode = GET_MODE_INNER (mode);
16290
16291 struct expand_operand ops[2];
16292 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16293 gcc_assert (icode != CODE_FOR_nothing);
16294
16295 create_output_operand (&ops[0], target, mode);
16296 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16297 expand_insn (icode, 2, ops);
16298
16299 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16300 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16301 emit_insr (target, builder.elt (i));
16302 }
16303
16304 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16305 when all trailing elements of BUILDER are the same.
16306 This works as follows:
16307 (a) Use the expand_insn interface to broadcast the last vector element into TARGET.
16308 (b) Insert the remaining elements into TARGET using insr.
16309
16310 ??? The heuristic used is to do the above if the number of identical trailing
16311 elements is at least 3/4 of the total number of elements, loosely based on
16312 the heuristic in mostly_zeros_p. May need fine-tuning. */
16313
16314 static bool
16315 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16316 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16317 {
16318 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16319 if (ndups >= (3 * nelts_reqd) / 4)
16320 {
16321 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16322 nelts_reqd - ndups + 1);
16323 return true;
16324 }
16325
16326 return false;
16327 }
16328
16329 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16330 of elements in BUILDER.
16331
16332 The function tries to initialize TARGET from BUILDER if it fits one
16333 of the special cases outlined below.
16334
16335 Failing that, the function divides BUILDER into two sub-vectors:
16336 v_even = even elements of BUILDER;
16337 v_odd = odd elements of BUILDER;
16338
16339 and recursively calls itself with v_even and v_odd.
16340
16341 if (recursive call succeeded for v_even or v_odd)
16342 TARGET = zip (v_even, v_odd)
16343
16344 The function returns true if it managed to build TARGET from BUILDER
16345 with one of the special cases, false otherwise.
16346
16347 Example: {a, 1, b, 2, c, 3, d, 4}
16348
16349 The vector gets divided into:
16350 v_even = {a, b, c, d}
16351 v_odd = {1, 2, 3, 4}
16352
16353 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16354 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16355
16356 aarch64_sve_expand_vector_init(v_even) fails since v_even contains only
16357 4 elements (the recursion threshold), so we construct tmp1 from v_even using insr:
16358 tmp1 = dup(d)
16359 insr tmp1, c
16360 insr tmp1, b
16361 insr tmp1, a
16362
16363 And finally:
16364 TARGET = zip (tmp1, tmp2)
16365 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16366
16367 static bool
16368 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16369 int nelts, int nelts_reqd)
16370 {
16371 machine_mode mode = GET_MODE (target);
16372
16373 /* Case 1: Vector contains trailing constants. */
16374
16375 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16376 (target, builder, nelts, nelts_reqd))
16377 return true;
16378
16379 /* Case 2: Vector contains leading constants. */
16380
16381 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16382 for (int i = 0; i < nelts_reqd; i++)
16383 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16384 rev_builder.finalize ();
16385
16386 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16387 (target, rev_builder, nelts, nelts_reqd))
16388 {
16389 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16390 return true;
16391 }
16392
16393 /* Case 3: Vector contains trailing same element. */
16394
16395 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16396 (target, builder, nelts_reqd))
16397 return true;
16398
16399 /* Case 4: Vector contains leading same element. */
16400
16401 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16402 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16403 {
16404 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16405 return true;
16406 }
16407
16408 /* Avoid recursing below 4 elements.
16409 ??? The threshold 4 may need fine-tuning. */
16410
16411 if (nelts_reqd <= 4)
16412 return false;
16413
16414 rtx_vector_builder v_even (mode, 1, nelts);
16415 rtx_vector_builder v_odd (mode, 1, nelts);
16416
16417 for (int i = 0; i < nelts * 2; i += 2)
16418 {
16419 v_even.quick_push (builder.elt (i));
16420 v_odd.quick_push (builder.elt (i + 1));
16421 }
16422
16423 v_even.finalize ();
16424 v_odd.finalize ();
16425
16426 rtx tmp1 = gen_reg_rtx (mode);
16427 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16428 nelts, nelts_reqd / 2);
16429
16430 rtx tmp2 = gen_reg_rtx (mode);
16431 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16432 nelts, nelts_reqd / 2);
16433
16434 if (!did_even_p && !did_odd_p)
16435 return false;
16436
16437 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16438 special cases and zip v_even, v_odd. */
16439
16440 if (!did_even_p)
16441 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16442
16443 if (!did_odd_p)
16444 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16445
16446 rtvec v = gen_rtvec (2, tmp1, tmp2);
16447 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16448 return true;
16449 }
16450
16451 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16452
16453 void
16454 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16455 {
16456 machine_mode mode = GET_MODE (target);
16457 int nelts = XVECLEN (vals, 0);
16458
16459 rtx_vector_builder v (mode, 1, nelts);
16460 for (int i = 0; i < nelts; i++)
16461 v.quick_push (XVECEXP (vals, 0, i));
16462 v.finalize ();
16463
16464 /* If neither sub-vector of v could be initialized specially,
16465 then use INSR to insert all elements from v into TARGET.
16466 ??? This might not be optimal for vectors with large
16467 initializers like 16-element or above.
16468 For nelts < 4, it probably isn't useful to handle specially. */
16469
16470 if (nelts < 4
16471 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16472 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16473 }
16474
16475 static unsigned HOST_WIDE_INT
16476 aarch64_shift_truncation_mask (machine_mode mode)
16477 {
16478 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16479 return 0;
16480 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16481 }
16482
16483 /* Select a format to encode pointers in exception handling data. */
16484 int
16485 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16486 {
16487 int type;
16488 switch (aarch64_cmodel)
16489 {
16490 case AARCH64_CMODEL_TINY:
16491 case AARCH64_CMODEL_TINY_PIC:
16492 case AARCH64_CMODEL_SMALL:
16493 case AARCH64_CMODEL_SMALL_PIC:
16494 case AARCH64_CMODEL_SMALL_SPIC:
16495 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16496 for everything. */
16497 type = DW_EH_PE_sdata4;
16498 break;
16499 default:
16500 /* No assumptions here. 8-byte relocs required. */
16501 type = DW_EH_PE_sdata8;
16502 break;
16503 }
16504 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16505 }
16506
16507 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
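/* For example, for a function foo declared with the aarch64_vector_pcs
   attribute this emits "\t.variant_pcs\tfoo".  */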
16508
16509 static void
16510 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16511 {
16512 if (aarch64_simd_decl_p (decl))
16513 {
16514 fprintf (stream, "\t.variant_pcs\t");
16515 assemble_name (stream, name);
16516 fprintf (stream, "\n");
16517 }
16518 }
16519
16520 /* The last .arch and .tune assembly strings that we printed. */
16521 static std::string aarch64_last_printed_arch_string;
16522 static std::string aarch64_last_printed_tune_string;
16523
16524 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16525 by the function fndecl. */
16526
16527 void
16528 aarch64_declare_function_name (FILE *stream, const char* name,
16529 tree fndecl)
16530 {
16531 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16532
16533 struct cl_target_option *targ_options;
16534 if (target_parts)
16535 targ_options = TREE_TARGET_OPTION (target_parts);
16536 else
16537 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16538 gcc_assert (targ_options);
16539
16540 const struct processor *this_arch
16541 = aarch64_get_arch (targ_options->x_explicit_arch);
16542
16543 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16544 std::string extension
16545 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16546 this_arch->flags);
16547 /* Only update the assembler .arch string if it is distinct from the last
16548 such string we printed. */
16549 std::string to_print = this_arch->name + extension;
16550 if (to_print != aarch64_last_printed_arch_string)
16551 {
16552 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16553 aarch64_last_printed_arch_string = to_print;
16554 }
16555
16556 /* Print the cpu name we're tuning for in the comments; it might be
16557 useful to readers of the generated asm. Do it only when it changes
16558 from function to function and verbose assembly is requested. */
16559 const struct processor *this_tune
16560 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16561
16562 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16563 {
16564 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16565 this_tune->name);
16566 aarch64_last_printed_tune_string = this_tune->name;
16567 }
16568
16569 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16570
16571 /* Don't forget the type directive for ELF. */
16572 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16573 ASM_OUTPUT_LABEL (stream, name);
16574 }
16575
16576 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16577
16578 void
16579 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16580 {
16581 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16582 const char *value = IDENTIFIER_POINTER (target);
16583 aarch64_asm_output_variant_pcs (stream, decl, name);
16584 ASM_OUTPUT_DEF (stream, name, value);
16585 }
16586
16587 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16588 function symbol references. */
16589
16590 void
16591 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16592 {
16593 default_elf_asm_output_external (stream, decl, name);
16594 aarch64_asm_output_variant_pcs (stream, decl, name);
16595 }
16596
16597 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16598 Used to output the .cfi_b_key_frame directive when signing the current
16599 function with the B key. */
16600
16601 void
16602 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16603 {
16604 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16605 && aarch64_ra_sign_key == AARCH64_KEY_B)
16606 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16607 }
16608
16609 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
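/* For example, if the command-line default is -march=armv8.2-a+sve, the
   header printed here is something like "\t.arch armv8.2-a+sve".  */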
16610
16611 static void
16612 aarch64_start_file (void)
16613 {
16614 struct cl_target_option *default_options
16615 = TREE_TARGET_OPTION (target_option_default_node);
16616
16617 const struct processor *default_arch
16618 = aarch64_get_arch (default_options->x_explicit_arch);
16619 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16620 std::string extension
16621 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16622 default_arch->flags);
16623
16624 aarch64_last_printed_arch_string = default_arch->name + extension;
16625 aarch64_last_printed_tune_string = "";
16626 asm_fprintf (asm_out_file, "\t.arch %s\n",
16627 aarch64_last_printed_arch_string.c_str ());
16628
16629 default_file_start ();
16630 }
16631
16632 /* Emit load exclusive. */
16633
16634 static void
16635 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16636 rtx mem, rtx model_rtx)
16637 {
16638 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16639 }
16640
16641 /* Emit store exclusive. */
16642
16643 static void
16644 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16645 rtx mem, rtx rval, rtx model_rtx)
16646 {
16647 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16648 }
16649
16650 /* Mark the previous jump instruction as unlikely. */
16651
16652 static void
16653 aarch64_emit_unlikely_jump (rtx insn)
16654 {
16655 rtx_insn *jump = emit_jump_insn (insn);
16656 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16657 }
16658
16659 /* Expand a compare and swap pattern. */
16660
16661 void
16662 aarch64_expand_compare_and_swap (rtx operands[])
16663 {
16664 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16665 machine_mode mode, r_mode;
16666
16667 bval = operands[0];
16668 rval = operands[1];
16669 mem = operands[2];
16670 oldval = operands[3];
16671 newval = operands[4];
16672 is_weak = operands[5];
16673 mod_s = operands[6];
16674 mod_f = operands[7];
16675 mode = GET_MODE (mem);
16676
16677 /* Normally the succ memory model must be stronger than fail, but in the
16678 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16679 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16680 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16681 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16682 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16683
16684 r_mode = mode;
16685 if (mode == QImode || mode == HImode)
16686 {
16687 r_mode = SImode;
16688 rval = gen_reg_rtx (r_mode);
16689 }
16690
16691 if (TARGET_LSE)
16692 {
16693 /* The CAS insn requires oldval and rval to overlap, but we need to
16694 have a copy of oldval saved across the operation to tell if
16695 the operation is successful. */
16696 if (reg_overlap_mentioned_p (rval, oldval))
16697 rval = copy_to_mode_reg (r_mode, oldval);
16698 else
16699 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16700
16701 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16702 newval, mod_s));
16703 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16704 }
16705 else
16706 {
16707 /* The oldval predicate varies by mode. Test it and force to reg. */
16708 insn_code code = code_for_aarch64_compare_and_swap (mode);
16709 if (!insn_data[code].operand[2].predicate (oldval, mode))
16710 oldval = force_reg (mode, oldval);
16711
16712 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16713 is_weak, mod_s, mod_f));
16714 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16715 }
16716
16717 if (r_mode != mode)
16718 rval = gen_lowpart (mode, rval);
16719 emit_move_insn (operands[1], rval);
16720
16721 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16722 emit_insn (gen_rtx_SET (bval, x));
16723 }
16724
16725 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16726 sequence implementing an atomic operation. */
16727
16728 static void
16729 aarch64_emit_post_barrier (enum memmodel model)
16730 {
16731 const enum memmodel base_model = memmodel_base (model);
16732
16733 if (is_mm_sync (model)
16734 && (base_model == MEMMODEL_ACQUIRE
16735 || base_model == MEMMODEL_ACQ_REL
16736 || base_model == MEMMODEL_SEQ_CST))
16737 {
16738 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16739 }
16740 }
16741
16742 /* Split a compare and swap pattern. */
16743
16744 void
16745 aarch64_split_compare_and_swap (rtx operands[])
16746 {
16747 rtx rval, mem, oldval, newval, scratch;
16748 machine_mode mode;
16749 bool is_weak;
16750 rtx_code_label *label1, *label2;
16751 rtx x, cond;
16752 enum memmodel model;
16753 rtx model_rtx;
16754
16755 rval = operands[0];
16756 mem = operands[1];
16757 oldval = operands[2];
16758 newval = operands[3];
16759 is_weak = (operands[4] != const0_rtx);
16760 model_rtx = operands[5];
16761 scratch = operands[7];
16762 mode = GET_MODE (mem);
16763 model = memmodel_from_int (INTVAL (model_rtx));
16764
16765 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16766 loop:
16767 .label1:
16768 LD[A]XR rval, [mem]
16769 CBNZ rval, .label2
16770 ST[L]XR scratch, newval, [mem]
16771 CBNZ scratch, .label1
16772 .label2:
16773 CMP rval, 0. */
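/* Otherwise (a strong compare-and-swap against a non-zero OLDVAL) the
   emitted sequence is roughly:
     .label1:
	LD[A]XR rval, [mem]
	CMP rval, oldval
	B.NE .label2
	ST[L]XR scratch, newval, [mem]
	CBNZ scratch, .label1
     .label2:
   while the weak form omits the CBNZ retry branch.  */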
16774 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16775
16776 label1 = NULL;
16777 if (!is_weak)
16778 {
16779 label1 = gen_label_rtx ();
16780 emit_label (label1);
16781 }
16782 label2 = gen_label_rtx ();
16783
16784 /* The initial load can be relaxed for a __sync operation since a final
16785 barrier will be emitted to stop code hoisting. */
16786 if (is_mm_sync (model))
16787 aarch64_emit_load_exclusive (mode, rval, mem,
16788 GEN_INT (MEMMODEL_RELAXED));
16789 else
16790 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16791
16792 if (strong_zero_p)
16793 {
16794 if (aarch64_track_speculation)
16795 {
16796 /* Emit an explicit compare instruction, so that we can correctly
16797 track the condition codes. */
16798 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16799 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16800 }
16801 else
16802 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16803
16804 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16805 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16806 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16807 }
16808 else
16809 {
16810 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16811 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16812 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16813 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16814 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16815 }
16816
16817 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16818
16819 if (!is_weak)
16820 {
16821 if (aarch64_track_speculation)
16822 {
16823 /* Emit an explicit compare instruction, so that we can correctly
16824 track the condition codes. */
16825 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16826 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16827 }
16828 else
16829 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16830
16831 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16832 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16833 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16834 }
16835 else
16836 {
16837 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16838 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16839 emit_insn (gen_rtx_SET (cond, x));
16840 }
16841
16842 emit_label (label2);
16843 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16844 to set the condition flags. If this is not used it will be removed by
16845 later passes. */
16846 if (strong_zero_p)
16847 {
16848 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16849 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16850 emit_insn (gen_rtx_SET (cond, x));
16851 }
16852 /* Emit any final barrier needed for a __sync operation. */
16853 if (is_mm_sync (model))
16854 aarch64_emit_post_barrier (model);
16855 }
16856
16857 /* Split an atomic operation. */
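/* The split form is roughly:
     .label:
	LD[A]XR old_out, [mem]
	<op> new_out, old_out, value
	ST[L]XR cond, new_out, [mem]
	CBNZ cond, .label
   with a trailing barrier added for __sync operations.  */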
16858
16859 void
16860 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16861 rtx value, rtx model_rtx, rtx cond)
16862 {
16863 machine_mode mode = GET_MODE (mem);
16864 machine_mode wmode = (mode == DImode ? DImode : SImode);
16865 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16866 const bool is_sync = is_mm_sync (model);
16867 rtx_code_label *label;
16868 rtx x;
16869
16870 /* Split the atomic operation into a sequence. */
16871 label = gen_label_rtx ();
16872 emit_label (label);
16873
16874 if (new_out)
16875 new_out = gen_lowpart (wmode, new_out);
16876 if (old_out)
16877 old_out = gen_lowpart (wmode, old_out);
16878 else
16879 old_out = new_out;
16880 value = simplify_gen_subreg (wmode, value, mode, 0);
16881
16882 /* The initial load can be relaxed for a __sync operation since a final
16883 barrier will be emitted to stop code hoisting. */
16884 if (is_sync)
16885 aarch64_emit_load_exclusive (mode, old_out, mem,
16886 GEN_INT (MEMMODEL_RELAXED));
16887 else
16888 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16889
16890 switch (code)
16891 {
16892 case SET:
16893 new_out = value;
16894 break;
16895
16896 case NOT:
16897 x = gen_rtx_AND (wmode, old_out, value);
16898 emit_insn (gen_rtx_SET (new_out, x));
16899 x = gen_rtx_NOT (wmode, new_out);
16900 emit_insn (gen_rtx_SET (new_out, x));
16901 break;
16902
16903 case MINUS:
16904 if (CONST_INT_P (value))
16905 {
16906 value = GEN_INT (-INTVAL (value));
16907 code = PLUS;
16908 }
16909 /* Fall through. */
16910
16911 default:
16912 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16913 emit_insn (gen_rtx_SET (new_out, x));
16914 break;
16915 }
16916
16917 aarch64_emit_store_exclusive (mode, cond, mem,
16918 gen_lowpart (mode, new_out), model_rtx);
16919
16920 if (aarch64_track_speculation)
16921 {
16922 /* Emit an explicit compare instruction, so that we can correctly
16923 track the condition codes. */
16924 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16925 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16926 }
16927 else
16928 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16929
16930 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16931 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16932 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16933
16934 /* Emit any final barrier needed for a __sync operation. */
16935 if (is_sync)
16936 aarch64_emit_post_barrier (model);
16937 }
16938
16939 static void
16940 aarch64_init_libfuncs (void)
16941 {
16942 /* Half-precision float operations. The compiler handles all operations
16943 with NULL libfuncs by converting to SFmode. */
16944
16945 /* Conversions. */
16946 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16947 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16948
16949 /* Arithmetic. */
16950 set_optab_libfunc (add_optab, HFmode, NULL);
16951 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16952 set_optab_libfunc (smul_optab, HFmode, NULL);
16953 set_optab_libfunc (neg_optab, HFmode, NULL);
16954 set_optab_libfunc (sub_optab, HFmode, NULL);
16955
16956 /* Comparisons. */
16957 set_optab_libfunc (eq_optab, HFmode, NULL);
16958 set_optab_libfunc (ne_optab, HFmode, NULL);
16959 set_optab_libfunc (lt_optab, HFmode, NULL);
16960 set_optab_libfunc (le_optab, HFmode, NULL);
16961 set_optab_libfunc (ge_optab, HFmode, NULL);
16962 set_optab_libfunc (gt_optab, HFmode, NULL);
16963 set_optab_libfunc (unord_optab, HFmode, NULL);
16964 }
16965
16966 /* Target hook for c_mode_for_suffix. */
16967 static machine_mode
16968 aarch64_c_mode_for_suffix (char suffix)
16969 {
16970 if (suffix == 'q')
16971 return TFmode;
16972
16973 return VOIDmode;
16974 }
16975
16976 /* We can only represent floating point constants which will fit in
16977 "quarter-precision" values. These values are characterised by
16978 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16979 by:
16980
16981 (-1)^s * (n/16) * 2^r
16982
16983 Where:
16984 's' is the sign bit.
16985 'n' is an integer in the range 16 <= n <= 31.
16986 'r' is an integer in the range -3 <= r <= 4. */
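/* For example, 1.5 is (24/16) * 2^0 and 0.25 is (16/16) * 2^-2, so both are
   representable; the representable magnitudes therefore run from
   16/16 * 2^-3 = 0.125 up to 31/16 * 2^4 = 31.0.  */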
16987
16988 /* Return true iff X can be represented as a quarter-precision
16989 floating point immediate operand. Note, we cannot represent 0.0. */
16990 bool
16991 aarch64_float_const_representable_p (rtx x)
16992 {
16993 /* This represents our current view of how many bits
16994 make up the mantissa. */
16995 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16996 int exponent;
16997 unsigned HOST_WIDE_INT mantissa, mask;
16998 REAL_VALUE_TYPE r, m;
16999 bool fail;
17000
17001 x = unwrap_const_vec_duplicate (x);
17002 if (!CONST_DOUBLE_P (x))
17003 return false;
17004
17005 if (GET_MODE (x) == VOIDmode
17006 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17007 return false;
17008
17009 r = *CONST_DOUBLE_REAL_VALUE (x);
17010
17011 /* We cannot represent infinities, NaNs or +/-zero. We won't
17012 know if we have +zero until we analyse the mantissa, but we
17013 can reject the other invalid values. */
17014 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17015 || REAL_VALUE_MINUS_ZERO (r))
17016 return false;
17017
17018 /* Extract exponent. */
17019 r = real_value_abs (&r);
17020 exponent = REAL_EXP (&r);
17021
17022 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17023 highest (sign) bit, with a fixed binary point at bit point_pos.
17024 The low and high parts of the mantissa end up in the two halves of W below.
17025 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17026 bits for the mantissa, this can fail (low bits will be lost). */
17027 real_ldexp (&m, &r, point_pos - exponent);
17028 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17029
17030 /* If the low part of the mantissa has bits set we cannot represent
17031 the value. */
17032 if (w.ulow () != 0)
17033 return false;
17034 /* We have rejected the lower HOST_WIDE_INT, so update our
17035 understanding of how many bits lie in the mantissa and
17036 look only at the high HOST_WIDE_INT. */
17037 mantissa = w.elt (1);
17038 point_pos -= HOST_BITS_PER_WIDE_INT;
17039
17040 /* We can only represent values with a mantissa of the form 1.xxxx. */
17041 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17042 if ((mantissa & mask) != 0)
17043 return false;
17044
17045 /* Having filtered unrepresentable values, we may now remove all
17046 but the highest 5 bits. */
17047 mantissa >>= point_pos - 5;
17048
17049 /* We cannot represent the value 0.0, so reject it. This is handled
17050 elsewhere. */
17051 if (mantissa == 0)
17052 return false;
17053
17054 /* Then, as bit 4 is always set, we can mask it off, leaving
17055 the mantissa in the range [0, 15]. */
17056 mantissa &= ~(1 << 4);
17057 gcc_assert (mantissa <= 15);
17058
17059 /* GCC internally does not use IEEE754-like encoding (where normalized
17060 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17061 Our mantissa values are shifted 4 places to the left relative to
17062 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17063 by 5 places to correct for GCC's representation. */
17064 exponent = 5 - exponent;
17065
17066 return (exponent >= 0 && exponent <= 7);
17067 }
17068
17069 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17070 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17071 output MOVI/MVNI, ORR or BIC immediate. */
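/* For instance, a V4SI constant with every lane equal to 0x100 would
   typically produce the template "movi\t%0.4s, 0x1, lsl 8".  */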
17072 char*
17073 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17074 enum simd_immediate_check which)
17075 {
17076 bool is_valid;
17077 static char templ[40];
17078 const char *mnemonic;
17079 const char *shift_op;
17080 unsigned int lane_count = 0;
17081 char element_char;
17082
17083 struct simd_immediate_info info;
17084
17085 /* This will return true to show const_vector is legal for use as either
17086 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17087 It will also update INFO to show how the immediate should be generated.
17088 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17089 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17090 gcc_assert (is_valid);
17091
17092 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17093 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17094
17095 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17096 {
17097 gcc_assert (info.insn == simd_immediate_info::MOV
17098 && info.u.mov.shift == 0);
17099 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17100 move immediate path. */
17101 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17102 info.u.mov.value = GEN_INT (0);
17103 else
17104 {
17105 const unsigned int buf_size = 20;
17106 char float_buf[buf_size] = {'\0'};
17107 real_to_decimal_for_mode (float_buf,
17108 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17109 buf_size, buf_size, 1, info.elt_mode);
17110
17111 if (lane_count == 1)
17112 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17113 else
17114 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17115 lane_count, element_char, float_buf);
17116 return templ;
17117 }
17118 }
17119
17120 gcc_assert (CONST_INT_P (info.u.mov.value));
17121
17122 if (which == AARCH64_CHECK_MOV)
17123 {
17124 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17125 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17126 ? "msl" : "lsl");
17127 if (lane_count == 1)
17128 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17129 mnemonic, UINTVAL (info.u.mov.value));
17130 else if (info.u.mov.shift)
17131 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17132 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17133 element_char, UINTVAL (info.u.mov.value), shift_op,
17134 info.u.mov.shift);
17135 else
17136 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17137 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17138 element_char, UINTVAL (info.u.mov.value));
17139 }
17140 else
17141 {
17142 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17143 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17144 if (info.u.mov.shift)
17145 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17146 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17147 element_char, UINTVAL (info.u.mov.value), "lsl",
17148 info.u.mov.shift);
17149 else
17150 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17151 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17152 element_char, UINTVAL (info.u.mov.value));
17153 }
17154 return templ;
17155 }
17156
17157 char*
17158 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17159 {
17160
17161 /* If a floating-point number was passed and we want to use it in an
17162 integer mode, do the conversion to integer. */
17163 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17164 {
17165 unsigned HOST_WIDE_INT ival;
17166 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17167 gcc_unreachable ();
17168 immediate = gen_int_mode (ival, mode);
17169 }
17170
17171 machine_mode vmode;
17172 /* Use a 64-bit mode for everything except DI/DF mode, where we use
17173 a 128-bit vector mode. */
17174 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17175
17176 vmode = aarch64_simd_container_mode (mode, width);
17177 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17178 return aarch64_output_simd_mov_immediate (v_op, width);
17179 }
17180
17181 /* Return the output string to use for moving immediate CONST_VECTOR
17182 into an SVE register. */
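/* For example, an all-true predicate over 32-bit elements with a fixed
   128-bit vector length prints as "ptrue\t%0.s, vl4", and a data vector of
   duplicated QImode 1s prints as "mov\t%0.b, #1".  */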
17183
17184 char *
17185 aarch64_output_sve_mov_immediate (rtx const_vector)
17186 {
17187 static char templ[40];
17188 struct simd_immediate_info info;
17189 char element_char;
17190
17191 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17192 gcc_assert (is_valid);
17193
17194 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17195
17196 machine_mode vec_mode = GET_MODE (const_vector);
17197 if (aarch64_sve_pred_mode_p (vec_mode))
17198 {
17199 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17200 if (info.insn == simd_immediate_info::MOV)
17201 {
17202 gcc_assert (info.u.mov.value == const0_rtx);
17203 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17204 }
17205 else
17206 {
17207 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17208 unsigned int total_bytes;
17209 if (info.u.pattern == AARCH64_SV_ALL
17210 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17211 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17212 total_bytes / GET_MODE_SIZE (info.elt_mode));
17213 else
17214 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17215 svpattern_token (info.u.pattern));
17216 }
17217 return buf;
17218 }
17219
17220 if (info.insn == simd_immediate_info::INDEX)
17221 {
17222 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17223 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17224 element_char, INTVAL (info.u.index.base),
17225 INTVAL (info.u.index.step));
17226 return templ;
17227 }
17228
17229 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17230 {
17231 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17232 info.u.mov.value = GEN_INT (0);
17233 else
17234 {
17235 const int buf_size = 20;
17236 char float_buf[buf_size] = {};
17237 real_to_decimal_for_mode (float_buf,
17238 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17239 buf_size, buf_size, 1, info.elt_mode);
17240
17241 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17242 element_char, float_buf);
17243 return templ;
17244 }
17245 }
17246
17247 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17248 element_char, INTVAL (info.u.mov.value));
17249 return templ;
17250 }
17251
17252 /* Split operands into moves from op[1] + op[2] into op[0]. */
17253
17254 void
17255 aarch64_split_combinev16qi (rtx operands[3])
17256 {
17257 unsigned int dest = REGNO (operands[0]);
17258 unsigned int src1 = REGNO (operands[1]);
17259 unsigned int src2 = REGNO (operands[2]);
17260 machine_mode halfmode = GET_MODE (operands[1]);
17261 unsigned int halfregs = REG_NREGS (operands[1]);
17262 rtx destlo, desthi;
17263
17264 gcc_assert (halfmode == V16QImode);
17265
17266 if (src1 == dest && src2 == dest + halfregs)
17267 {
17268 /* No-op move. Can't split to nothing; emit something. */
17269 emit_note (NOTE_INSN_DELETED);
17270 return;
17271 }
17272
17273 /* Preserve register attributes for variable tracking. */
17274 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17275 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17276 GET_MODE_SIZE (halfmode));
17277
17278 /* Special case of reversed high/low parts. Use an XOR swap to exchange the two inputs without needing a scratch register. */
17279 if (reg_overlap_mentioned_p (operands[2], destlo)
17280 && reg_overlap_mentioned_p (operands[1], desthi))
17281 {
17282 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17283 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17284 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17285 }
17286 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17287 {
17288 /* Try to avoid unnecessary moves if part of the result
17289 is in the right place already. */
17290 if (src1 != dest)
17291 emit_move_insn (destlo, operands[1]);
17292 if (src2 != dest + halfregs)
17293 emit_move_insn (desthi, operands[2]);
17294 }
17295 else
17296 {
17297 if (src2 != dest + halfregs)
17298 emit_move_insn (desthi, operands[2]);
17299 if (src1 != dest)
17300 emit_move_insn (destlo, operands[1]);
17301 }
17302 }
17303
17304 /* vec_perm support. */
17305
17306 struct expand_vec_perm_d
17307 {
17308 rtx target, op0, op1;
17309 vec_perm_indices perm;
17310 machine_mode vmode;
17311 unsigned int vec_flags;
17312 bool one_vector_p;
17313 bool testing_p;
17314 };
17315
17316 /* Generate a variable permutation. */
17317
17318 static void
17319 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17320 {
17321 machine_mode vmode = GET_MODE (target);
17322 bool one_vector_p = rtx_equal_p (op0, op1);
17323
17324 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17325 gcc_checking_assert (GET_MODE (op0) == vmode);
17326 gcc_checking_assert (GET_MODE (op1) == vmode);
17327 gcc_checking_assert (GET_MODE (sel) == vmode);
17328 gcc_checking_assert (TARGET_SIMD);
17329
17330 if (one_vector_p)
17331 {
17332 if (vmode == V8QImode)
17333 {
17334 /* Expand the argument to a V16QI mode by duplicating it. */
17335 rtx pair = gen_reg_rtx (V16QImode);
17336 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17337 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17338 }
17339 else
17340 {
17341 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17342 }
17343 }
17344 else
17345 {
17346 rtx pair;
17347
17348 if (vmode == V8QImode)
17349 {
17350 pair = gen_reg_rtx (V16QImode);
17351 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17352 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17353 }
17354 else
17355 {
17356 pair = gen_reg_rtx (OImode);
17357 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17358 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17359 }
17360 }
17361 }
17362
17363 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17364 NELT is the number of elements in the vector. */
17365
17366 void
17367 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17368 unsigned int nelt)
17369 {
17370 machine_mode vmode = GET_MODE (target);
17371 bool one_vector_p = rtx_equal_p (op0, op1);
17372 rtx mask;
17373
17374 /* The TBL instruction does not use a modulo index, so we must take care
17375 of that ourselves. */
17376 mask = aarch64_simd_gen_const_vector_dup (vmode,
17377 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17378 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17379
17380 /* For big-endian, we also need to reverse the index within the vector
17381 (but not which vector). */
17382 if (BYTES_BIG_ENDIAN)
17383 {
17384 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17385 if (!one_vector_p)
17386 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17387 sel = expand_simple_binop (vmode, XOR, sel, mask,
17388 NULL, 0, OPTAB_LIB_WIDEN);
17389 }
17390 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17391 }
17392
17393 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17394
17395 static void
17396 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17397 {
17398 emit_insn (gen_rtx_SET (target,
17399 gen_rtx_UNSPEC (GET_MODE (target),
17400 gen_rtvec (2, op0, op1), code)));
17401 }
17402
17403 /* Expand an SVE vec_perm with the given operands. */
17404
17405 void
17406 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17407 {
17408 machine_mode data_mode = GET_MODE (target);
17409 machine_mode sel_mode = GET_MODE (sel);
17410 /* Enforced by the pattern condition. */
17411 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17412
17413 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17414 size of the two value vectors, i.e. the upper bits of the indices
17415 are effectively ignored. SVE TBL instead produces 0 for any
17416 out-of-range indices, so we need to modulo all the vec_perm indices
17417 to ensure they are all in range. */
17418 rtx sel_reg = force_reg (sel_mode, sel);
17419
17420 /* Check if the sel only references the first values vector. */
17421 if (GET_CODE (sel) == CONST_VECTOR
17422 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17423 {
17424 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17425 return;
17426 }
17427
17428 /* Check if the two values vectors are the same. */
17429 if (rtx_equal_p (op0, op1))
17430 {
17431 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17432 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17433 NULL, 0, OPTAB_DIRECT);
17434 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17435 return;
17436 }
17437
17438 /* Run TBL on each value vector and combine the results. */
17439
17440 rtx res0 = gen_reg_rtx (data_mode);
17441 rtx res1 = gen_reg_rtx (data_mode);
17442 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17443 if (GET_CODE (sel) != CONST_VECTOR
17444 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17445 {
17446 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17447 2 * nunits - 1);
17448 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17449 NULL, 0, OPTAB_DIRECT);
17450 }
17451 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17452 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17453 NULL, 0, OPTAB_DIRECT);
17454 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17455 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17456 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17457 else
17458 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17459 }
17460
17461 /* Recognize patterns suitable for the TRN instructions. */
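/* For example, on V4SI the selector { 0, 4, 2, 6 } maps to TRN1 and
   { 1, 5, 3, 7 } maps to TRN2 (in little-endian element numbering).  */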
17462 static bool
17463 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17464 {
17465 HOST_WIDE_INT odd;
17466 poly_uint64 nelt = d->perm.length ();
17467 rtx out, in0, in1, x;
17468 machine_mode vmode = d->vmode;
17469
17470 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17471 return false;
17472
17473 /* Note that these are little-endian tests.
17474 We correct for big-endian later. */
17475 if (!d->perm[0].is_constant (&odd)
17476 || (odd != 0 && odd != 1)
17477 || !d->perm.series_p (0, 2, odd, 2)
17478 || !d->perm.series_p (1, 2, nelt + odd, 2))
17479 return false;
17480
17481 /* Success! */
17482 if (d->testing_p)
17483 return true;
17484
17485 in0 = d->op0;
17486 in1 = d->op1;
17487 /* We don't need a big-endian lane correction for SVE; see the comment
17488 at the head of aarch64-sve.md for details. */
17489 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17490 {
17491 x = in0, in0 = in1, in1 = x;
17492 odd = !odd;
17493 }
17494 out = d->target;
17495
17496 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17497 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17498 return true;
17499 }
17500
17501 /* Recognize patterns suitable for the UZP instructions. */
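/* For example, on V4SI the selector { 0, 2, 4, 6 } maps to UZP1 and
   { 1, 3, 5, 7 } maps to UZP2 (in little-endian element numbering).  */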
17502 static bool
17503 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17504 {
17505 HOST_WIDE_INT odd;
17506 rtx out, in0, in1, x;
17507 machine_mode vmode = d->vmode;
17508
17509 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17510 return false;
17511
17512 /* Note that these are little-endian tests.
17513 We correct for big-endian later. */
17514 if (!d->perm[0].is_constant (&odd)
17515 || (odd != 0 && odd != 1)
17516 || !d->perm.series_p (0, 1, odd, 2))
17517 return false;
17518
17519 /* Success! */
17520 if (d->testing_p)
17521 return true;
17522
17523 in0 = d->op0;
17524 in1 = d->op1;
17525 /* We don't need a big-endian lane correction for SVE; see the comment
17526 at the head of aarch64-sve.md for details. */
17527 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17528 {
17529 x = in0, in0 = in1, in1 = x;
17530 odd = !odd;
17531 }
17532 out = d->target;
17533
17534 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17535 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17536 return true;
17537 }
17538
17539 /* Recognize patterns suitable for the ZIP instructions. */
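/* For example, on V4SI the selector { 0, 4, 1, 5 } maps to ZIP1 and
   { 2, 6, 3, 7 } maps to ZIP2 (in little-endian element numbering).  */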
17540 static bool
17541 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17542 {
17543 unsigned int high;
17544 poly_uint64 nelt = d->perm.length ();
17545 rtx out, in0, in1, x;
17546 machine_mode vmode = d->vmode;
17547
17548 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17549 return false;
17550
17551 /* Note that these are little-endian tests.
17552 We correct for big-endian later. */
17553 poly_uint64 first = d->perm[0];
17554 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17555 || !d->perm.series_p (0, 2, first, 1)
17556 || !d->perm.series_p (1, 2, first + nelt, 1))
17557 return false;
17558 high = maybe_ne (first, 0U);
17559
17560 /* Success! */
17561 if (d->testing_p)
17562 return true;
17563
17564 in0 = d->op0;
17565 in1 = d->op1;
17566 /* We don't need a big-endian lane correction for SVE; see the comment
17567 at the head of aarch64-sve.md for details. */
17568 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17569 {
17570 x = in0, in0 = in1, in1 = x;
17571 high = !high;
17572 }
17573 out = d->target;
17574
17575 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17576 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17577 return true;
17578 }
17579
17580 /* Recognize patterns for the EXT insn. */
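/* For example, on V4SI the selector { 1, 2, 3, 4 } is an EXT of the two
   input vectors with an offset of one element.  */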
17581
17582 static bool
17583 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17584 {
17585 HOST_WIDE_INT location;
17586 rtx offset;
17587
17588 /* The first element always refers to the first vector.
17589 Check if the extracted indices are increasing by one. */
17590 if (d->vec_flags == VEC_SVE_PRED
17591 || !d->perm[0].is_constant (&location)
17592 || !d->perm.series_p (0, 1, location, 1))
17593 return false;
17594
17595 /* Success! */
17596 if (d->testing_p)
17597 return true;
17598
17599 /* The case where (location == 0) is a no-op for both big- and little-endian,
17600 and is removed by the mid-end at optimization levels -O1 and higher.
17601
17602 We don't need a big-endian lane correction for SVE; see the comment
17603 at the head of aarch64-sve.md for details. */
17604 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17605 {
17606 /* After setup, we want the high elements of the first vector (stored
17607 at the LSB end of the register), and the low elements of the second
17608 vector (stored at the MSB end of the register). So swap. */
17609 std::swap (d->op0, d->op1);
17610 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17611 to_constant () is safe since this is restricted to Advanced SIMD
17612 vectors. */
17613 location = d->perm.length ().to_constant () - location;
17614 }
17615
17616 offset = GEN_INT (location);
17617 emit_set_insn (d->target,
17618 gen_rtx_UNSPEC (d->vmode,
17619 gen_rtvec (3, d->op0, d->op1, offset),
17620 UNSPEC_EXT));
17621 return true;
17622 }
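
/* For example, on a 4-element vector the selector { 1, 2, 3, 4 } has a
   constant first index and consecutive elements, crossing from the first
   input into the second, so it is matched above as EXT with an element
   offset of 1 (illustrative little-endian case).  */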
17623
17624 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17625 within each 64-bit, 32-bit or 16-bit granule. */
17626
17627 static bool
17628 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17629 {
17630 HOST_WIDE_INT diff;
17631 unsigned int i, size, unspec;
17632 machine_mode pred_mode;
17633
17634 if (d->vec_flags == VEC_SVE_PRED
17635 || !d->one_vector_p
17636 || !d->perm[0].is_constant (&diff))
17637 return false;
17638
17639 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17640 if (size == 8)
17641 {
17642 unspec = UNSPEC_REV64;
17643 pred_mode = VNx2BImode;
17644 }
17645 else if (size == 4)
17646 {
17647 unspec = UNSPEC_REV32;
17648 pred_mode = VNx4BImode;
17649 }
17650 else if (size == 2)
17651 {
17652 unspec = UNSPEC_REV16;
17653 pred_mode = VNx8BImode;
17654 }
17655 else
17656 return false;
17657
17658 unsigned int step = diff + 1;
17659 for (i = 0; i < step; ++i)
17660 if (!d->perm.series_p (i, step, diff - i, step))
17661 return false;
17662
17663 /* Success! */
17664 if (d->testing_p)
17665 return true;
17666
17667 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17668 if (d->vec_flags == VEC_SVE_DATA)
17669 {
17670 rtx pred = aarch64_ptrue_reg (pred_mode);
17671 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17672 UNSPEC_PRED_X);
17673 }
17674 emit_set_insn (d->target, src);
17675 return true;
17676 }
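
/* For example, with 32-bit elements the selector { 1, 0, 3, 2, ... }
   reverses each pair of elements, i.e. each 64-bit granule, and is
   matched above as REV64; the same selector on 16-bit elements would be
   matched as REV32, since each reversed pair then spans only 32 bits
   (illustrative examples).  */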
17677
17678 /* Recognize patterns for the REV insn, which reverses elements within
17679 a full vector. */
17680
17681 static bool
17682 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17683 {
17684 poly_uint64 nelt = d->perm.length ();
17685
17686 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17687 return false;
17688
17689 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17690 return false;
17691
17692 /* Success! */
17693 if (d->testing_p)
17694 return true;
17695
17696 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17697 emit_set_insn (d->target, src);
17698 return true;
17699 }
17700
17701 static bool
17702 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17703 {
17704 rtx out = d->target;
17705 rtx in0;
17706 HOST_WIDE_INT elt;
17707 machine_mode vmode = d->vmode;
17708 rtx lane;
17709
17710 if (d->vec_flags == VEC_SVE_PRED
17711 || d->perm.encoding ().encoded_nelts () != 1
17712 || !d->perm[0].is_constant (&elt))
17713 return false;
17714
17715 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17716 return false;
17717
17718 /* Success! */
17719 if (d->testing_p)
17720 return true;
17721
17722 /* The generic preparation in aarch64_expand_vec_perm_const_1
17723 swaps the operand order and the permute indices if it finds
17724 d->perm[0] to be in the second operand. Thus, we can always
17725 use d->op0 and need not do any extra arithmetic to get the
17726 correct lane number. */
17727 in0 = d->op0;
17728 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17729
17730 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17731 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17732 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17733 return true;
17734 }
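
/* For example, the selector { 2, 2, 2, 2 } has a single encoded element
   and a constant first index, so the function above turns it into a DUP
   of lane 2 of the first operand (illustrative example).  */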
17735
17736 static bool
17737 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17738 {
17739 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17740 machine_mode vmode = d->vmode;
17741
17742 /* Make sure that the indices are constant. */
17743 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17744 for (unsigned int i = 0; i < encoded_nelts; ++i)
17745 if (!d->perm[i].is_constant ())
17746 return false;
17747
17748 if (d->testing_p)
17749 return true;
17750
17751 /* Generic code will try constant permutation twice: once with the
17752 original mode and again with the elements lowered to QImode.
17753 So wait and don't do the selector expansion ourselves. */
17754 if (vmode != V8QImode && vmode != V16QImode)
17755 return false;
17756
17757 /* to_constant is safe since this routine is specific to Advanced SIMD
17758 vectors. */
17759 unsigned int nelt = d->perm.length ().to_constant ();
17760 for (unsigned int i = 0; i < nelt; ++i)
17761 /* If big-endian and two vectors, we end up with a weird mixed-endian
17762 mode on NEON. Reverse the index within each word but not the word
17763 itself. to_constant is safe because we checked is_constant above. */
17764 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17765 ? d->perm[i].to_constant () ^ (nelt - 1)
17766 : d->perm[i].to_constant ());
17767
17768 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17769 sel = force_reg (vmode, sel);
17770
17771 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17772 return true;
17773 }
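
/* As an illustration of the big-endian index adjustment above for a
   two-vector V16QI permute: a requested index of 3 is emitted as
   3 ^ 15 == 12 and an index of 19 as 19 ^ 15 == 28, i.e. the position
   within each 16-element input is reversed but the choice of input
   vector is unchanged.  */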
17774
17775 /* Try to implement D using an SVE TBL instruction. */
17776
17777 static bool
17778 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17779 {
17780 unsigned HOST_WIDE_INT nelt;
17781
17782 /* Permuting two variable-length vectors could overflow the
17783 index range. */
17784 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17785 return false;
17786
17787 if (d->testing_p)
17788 return true;
17789
17790 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17791 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17792 if (d->one_vector_p)
17793 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17794 else
17795 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17796 return true;
17797 }
17798
17799 static bool
17800 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17801 {
17802 /* The pattern matching functions above are written to look for a small
17803 number to begin the sequence (0, 1, N/2). If we begin with an index
17804 from the second operand, we can swap the operands. */
17805 poly_int64 nelt = d->perm.length ();
17806 if (known_ge (d->perm[0], nelt))
17807 {
17808 d->perm.rotate_inputs (1);
17809 std::swap (d->op0, d->op1);
17810 }
17811
17812 if ((d->vec_flags == VEC_ADVSIMD
17813 || d->vec_flags == VEC_SVE_DATA
17814 || d->vec_flags == VEC_SVE_PRED)
17815 && known_gt (nelt, 1))
17816 {
17817 if (aarch64_evpc_rev_local (d))
17818 return true;
17819 else if (aarch64_evpc_rev_global (d))
17820 return true;
17821 else if (aarch64_evpc_ext (d))
17822 return true;
17823 else if (aarch64_evpc_dup (d))
17824 return true;
17825 else if (aarch64_evpc_zip (d))
17826 return true;
17827 else if (aarch64_evpc_uzp (d))
17828 return true;
17829 else if (aarch64_evpc_trn (d))
17830 return true;
17831 if (d->vec_flags == VEC_SVE_DATA)
17832 return aarch64_evpc_sve_tbl (d);
17833 else if (d->vec_flags == VEC_ADVSIMD)
17834 return aarch64_evpc_tbl (d);
17835 }
17836 return false;
17837 }
17838
17839 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17840
17841 static bool
17842 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17843 rtx op1, const vec_perm_indices &sel)
17844 {
17845 struct expand_vec_perm_d d;
17846
17847 /* Check whether the mask can be applied to a single vector. */
17848 if (sel.ninputs () == 1
17849 || (op0 && rtx_equal_p (op0, op1)))
17850 d.one_vector_p = true;
17851 else if (sel.all_from_input_p (0))
17852 {
17853 d.one_vector_p = true;
17854 op1 = op0;
17855 }
17856 else if (sel.all_from_input_p (1))
17857 {
17858 d.one_vector_p = true;
17859 op0 = op1;
17860 }
17861 else
17862 d.one_vector_p = false;
17863
17864 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17865 sel.nelts_per_input ());
17866 d.vmode = vmode;
17867 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17868 d.target = target;
17869 d.op0 = op0;
17870 d.op1 = op1;
17871 d.testing_p = !target;
17872
17873 if (!d.testing_p)
17874 return aarch64_expand_vec_perm_const_1 (&d);
17875
17876 rtx_insn *last = get_last_insn ();
17877 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17878 gcc_assert (last == get_last_insn ());
17879
17880 return ret;
17881 }
17882
17883 /* Generate a byte permute mask for a register of mode MODE,
17884 which has NUNITS units. */
17885
17886 rtx
17887 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17888 {
17889 /* We have to reverse each vector because we don't have
17890 a permuted load that can reverse-load according to ABI rules. */
17891 rtx mask;
17892 rtvec v = rtvec_alloc (16);
17893 unsigned int i, j;
17894 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17895
17896 gcc_assert (BYTES_BIG_ENDIAN);
17897 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17898
17899 for (i = 0; i < nunits; i++)
17900 for (j = 0; j < usize; j++)
17901 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17902 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17903 return force_reg (V16QImode, mask);
17904 }
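
/* For example, for a Q register holding 2-byte units (NUNITS == 8,
   unit size 2) the mask built above is the byte sequence
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the
   two bytes of each unit are swapped (illustrative example).  */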
17905
17906 /* Expand an SVE integer comparison using the SVE equivalent of:
17907
17908 (set TARGET (CODE OP0 OP1)). */
17909
17910 void
17911 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17912 {
17913 machine_mode pred_mode = GET_MODE (target);
17914 machine_mode data_mode = GET_MODE (op0);
17915 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
17916 op0, op1);
17917 if (!rtx_equal_p (target, res))
17918 emit_move_insn (target, res);
17919 }
17920
17921 /* Return the UNSPEC_COND_* code for comparison CODE. */
17922
17923 static unsigned int
17924 aarch64_unspec_cond_code (rtx_code code)
17925 {
17926 switch (code)
17927 {
17928 case NE:
17929 return UNSPEC_COND_FCMNE;
17930 case EQ:
17931 return UNSPEC_COND_FCMEQ;
17932 case LT:
17933 return UNSPEC_COND_FCMLT;
17934 case GT:
17935 return UNSPEC_COND_FCMGT;
17936 case LE:
17937 return UNSPEC_COND_FCMLE;
17938 case GE:
17939 return UNSPEC_COND_FCMGE;
17940 case UNORDERED:
17941 return UNSPEC_COND_FCMUO;
17942 default:
17943 gcc_unreachable ();
17944 }
17945 }
17946
17947 /* Emit:
17948
17949 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17950
17951 where <X> is the operation associated with comparison CODE.
17952 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17953
17954 static void
17955 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
17956 bool known_ptrue_p, rtx op0, rtx op1)
17957 {
17958 rtx flag = gen_int_mode (known_ptrue_p, SImode);
17959 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17960 gen_rtvec (4, pred, flag, op0, op1),
17961 aarch64_unspec_cond_code (code));
17962 emit_set_insn (target, unspec);
17963 }
17964
17965 /* Emit the SVE equivalent of:
17966
17967 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
17968 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
17969 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17970
17971 where <Xi> is the operation associated with comparison CODEi.
17972 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17973
17974 static void
17975 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
17976 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
17977 {
17978 machine_mode pred_mode = GET_MODE (pred);
17979 rtx tmp1 = gen_reg_rtx (pred_mode);
17980 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
17981 rtx tmp2 = gen_reg_rtx (pred_mode);
17982 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
17983 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17984 }
17985
17986 /* Emit the SVE equivalent of:
17987
17988 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17989 (set TARGET (not TMP))
17990
17991 where <X> is the operation associated with comparison CODE.
17992 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17993
17994 static void
17995 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
17996 bool known_ptrue_p, rtx op0, rtx op1)
17997 {
17998 machine_mode pred_mode = GET_MODE (pred);
17999 rtx tmp = gen_reg_rtx (pred_mode);
18000 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18001 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18002 }
18003
18004 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18005
18006 (set TARGET (CODE OP0 OP1))
18007
18008 If CAN_INVERT_P is true, the caller can also handle inverted results;
18009 return true if the result is in fact inverted. */
18010
18011 bool
18012 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18013 rtx op0, rtx op1, bool can_invert_p)
18014 {
18015 machine_mode pred_mode = GET_MODE (target);
18016 machine_mode data_mode = GET_MODE (op0);
18017
18018 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18019 switch (code)
18020 {
18021 case UNORDERED:
18022 /* UNORDERED has no immediate form. */
18023 op1 = force_reg (data_mode, op1);
18024 /* fall through */
18025 case LT:
18026 case LE:
18027 case GT:
18028 case GE:
18029 case EQ:
18030 case NE:
18031 {
18032 /* There is native support for the comparison. */
18033 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18034 return false;
18035 }
18036
18037 case LTGT:
18038 /* This is a trapping operation (LT or GT). */
18039 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18040 return false;
18041
18042 case UNEQ:
18043 if (!flag_trapping_math)
18044 {
18045 /* This would trap for signaling NaNs. */
18046 op1 = force_reg (data_mode, op1);
18047 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18048 ptrue, true, op0, op1);
18049 return false;
18050 }
18051 /* fall through */
18052 case UNLT:
18053 case UNLE:
18054 case UNGT:
18055 case UNGE:
18056 if (flag_trapping_math)
18057 {
18058 /* Work out which elements are ordered. */
18059 rtx ordered = gen_reg_rtx (pred_mode);
18060 op1 = force_reg (data_mode, op1);
18061 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18062 ptrue, true, op0, op1);
18063
18064 /* Test the opposite condition for the ordered elements,
18065 then invert the result. */
18066 if (code == UNEQ)
18067 code = NE;
18068 else
18069 code = reverse_condition_maybe_unordered (code);
18070 if (can_invert_p)
18071 {
18072 aarch64_emit_sve_fp_cond (target, code,
18073 ordered, false, op0, op1);
18074 return true;
18075 }
18076 aarch64_emit_sve_invert_fp_cond (target, code,
18077 ordered, false, op0, op1);
18078 return false;
18079 }
18080 break;
18081
18082 case ORDERED:
18083 /* ORDERED has no immediate form. */
18084 op1 = force_reg (data_mode, op1);
18085 break;
18086
18087 default:
18088 gcc_unreachable ();
18089 }
18090
18091 /* There is native support for the inverse comparison. */
18092 code = reverse_condition_maybe_unordered (code);
18093 if (can_invert_p)
18094 {
18095 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18096 return true;
18097 }
18098 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18099 return false;
18100 }
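
/* For example, with trapping math enabled, UNLT (x, y) is handled above
   by first computing an ORDERED predicate as the inverse of an UNORDERED
   comparison, then testing the reverse condition GE only on the ordered
   lanes and inverting the result: ordered lanes with x < y and all
   unordered lanes end up set, and NaN lanes are never fed to the GE
   comparison (illustrative summary).  */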
18101
18102 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18103 of the data being selected and CMP_MODE is the mode of the values being
18104 compared. */
18105
18106 void
18107 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18108 rtx *ops)
18109 {
18110 machine_mode pred_mode
18111 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18112 GET_MODE_SIZE (cmp_mode)).require ();
18113 rtx pred = gen_reg_rtx (pred_mode);
18114 if (FLOAT_MODE_P (cmp_mode))
18115 {
18116 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18117 ops[4], ops[5], true))
18118 std::swap (ops[1], ops[2]);
18119 }
18120 else
18121 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18122
18123 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18124 ops[1] = force_reg (data_mode, ops[1]);
18125 /* The "false" value can only be zero if the "true" value is a constant. */
18126 if (register_operand (ops[1], data_mode)
18127 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18128 ops[2] = force_reg (data_mode, ops[2]);
18129
18130 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18131 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18132 }
18133
18134 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18135 true. However, due to issues with register allocation it is preferable
18136 to avoid tying integer scalar and FP scalar modes. Executing integer
18137 operations in general registers is better than treating them as scalar
18138 vector operations. This reduces latency and avoids redundant int<->FP
18139 moves. So tie modes if they are either the same class, or vector modes
18140 with other vector modes, vector structs or any scalar mode. */
18141
18142 static bool
18143 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18144 {
18145 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18146 return true;
18147
18148 /* We specifically want to allow elements of "structure" modes to
18149 be tieable to the structure. This more general condition allows
18150 other rarer situations too. The reason we don't extend this to
18151 predicate modes is that there are no predicate structure modes
18152 nor any specific instructions for extracting part of a predicate
18153 register. */
18154 if (aarch64_vector_data_mode_p (mode1)
18155 && aarch64_vector_data_mode_p (mode2))
18156 return true;
18157
18158 /* Also allow any scalar modes with vectors. */
18159 if (aarch64_vector_mode_supported_p (mode1)
18160 || aarch64_vector_mode_supported_p (mode2))
18161 return true;
18162
18163 return false;
18164 }
18165
18166 /* Return a new RTX holding the result of moving POINTER forward by
18167 AMOUNT bytes. */
18168
18169 static rtx
18170 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18171 {
18172 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18173
18174 return adjust_automodify_address (pointer, GET_MODE (pointer),
18175 next, amount);
18176 }
18177
18178 /* Return a new RTX holding the result of moving POINTER forward by the
18179 size of the mode it points to. */
18180
18181 static rtx
18182 aarch64_progress_pointer (rtx pointer)
18183 {
18184 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18185 }
18186
18187 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18188 MODE bytes. */
18189
18190 static void
18191 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18192 machine_mode mode)
18193 {
18194 rtx reg = gen_reg_rtx (mode);
18195
18196 /* "Cast" the pointers to the correct mode. */
18197 *src = adjust_address (*src, mode, 0);
18198 *dst = adjust_address (*dst, mode, 0);
18199 /* Emit the memcpy. */
18200 emit_move_insn (reg, *src);
18201 emit_move_insn (*dst, reg);
18202 /* Move the pointers forward. */
18203 *src = aarch64_progress_pointer (*src);
18204 *dst = aarch64_progress_pointer (*dst);
18205 }
18206
18207 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18208 we succeed, otherwise return false. */
18209
18210 bool
18211 aarch64_expand_cpymem (rtx *operands)
18212 {
18213 int n, mode_bits;
18214 rtx dst = operands[0];
18215 rtx src = operands[1];
18216 rtx base;
18217 machine_mode cur_mode = BLKmode, next_mode;
18218 bool speed_p = !optimize_function_for_size_p (cfun);
18219
18220 /* When optimizing for size, give a better estimate of the length of a
18221 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18222 will always require an even number of instructions, and each operation
18223 requires both a load and a store, so divide the maximum number by 2. */
18224 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18225
18226 /* We can't do anything smart if the amount to copy is not constant. */
18227 if (!CONST_INT_P (operands[2]))
18228 return false;
18229
18230 n = INTVAL (operands[2]);
18231
18232 /* Try to keep the number of instructions low. For all cases we will do at
18233 most two moves for the residual amount, since we'll always overlap the
18234 remainder. */
18235 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18236 return false;
18237
18238 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18239 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18240
18241 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18242 src = adjust_automodify_address (src, VOIDmode, base, 0);
18243
18244 /* Convert n to bits to make the rest of the code simpler. */
18245 n = n * BITS_PER_UNIT;
18246
18247 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18248 larger than TImode, but we should not use them for loads/stores here. */
18249 const int copy_limit = GET_MODE_BITSIZE (TImode);
18250
18251 while (n > 0)
18252 {
18253 /* Find the largest mode in which to do the copy without over-reading
18254 or over-writing. */
18255 opt_scalar_int_mode mode_iter;
18256 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18257 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18258 cur_mode = mode_iter.require ();
18259
18260 gcc_assert (cur_mode != BLKmode);
18261
18262 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18263 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18264
18265 n -= mode_bits;
18266
18267 /* Do certain trailing copies as overlapping if it's going to be
18268 cheaper, i.e. if it takes fewer instructions. For instance, for a
18269 15-byte copy it's more efficient to do two overlapping 8-byte copies
18270 than 8 + 4 + 2 + 1. */
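/* As an illustrative example of the overlap: for a 15-byte copy, the
   first iteration copies 8 bytes in DImode, leaving 7 bytes; the
   adjustment below then moves the pointers back by 1 byte so that the
   final iteration copies the remaining data as a single overlapping
   8-byte DImode copy.  */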
18271 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18272 {
18273 next_mode = smallest_mode_for_size (n, MODE_INT);
18274 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18275 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18276 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18277 n = n_bits;
18278 }
18279 }
18280
18281 return true;
18282 }
18283
18284 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18285 SImode stores. Handle the case when the constant has identical
18286 bottom and top halves. This is beneficial when the two stores can be
18287 merged into an STP and we avoid synthesising potentially expensive
18288 immediates twice. Return true if such a split is possible. */
18289
18290 bool
18291 aarch64_split_dimode_const_store (rtx dst, rtx src)
18292 {
18293 rtx lo = gen_lowpart (SImode, src);
18294 rtx hi = gen_highpart_mode (SImode, DImode, src);
18295
18296 bool size_p = optimize_function_for_size_p (cfun);
18297
18298 if (!rtx_equal_p (lo, hi))
18299 return false;
18300
18301 unsigned int orig_cost
18302 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18303 unsigned int lo_cost
18304 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18305
18306 /* We want to transform:
18307 MOV x1, 49370
18308 MOVK x1, 0x140, lsl 16
18309 MOVK x1, 0xc0da, lsl 32
18310 MOVK x1, 0x140, lsl 48
18311 STR x1, [x0]
18312 into:
18313 MOV w1, 49370
18314 MOVK w1, 0x140, lsl 16
18315 STP w1, w1, [x0]
18316 So we want to perform this only when we save two instructions
18317 or more. When optimizing for size, however, accept any code size
18318 savings we can. */
18319 if (size_p && orig_cost <= lo_cost)
18320 return false;
18321
18322 if (!size_p
18323 && (orig_cost <= lo_cost + 1))
18324 return false;
18325
18326 rtx mem_lo = adjust_address (dst, SImode, 0);
18327 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18328 return false;
18329
18330 rtx tmp_reg = gen_reg_rtx (SImode);
18331 aarch64_expand_mov_immediate (tmp_reg, lo);
18332 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18333 /* Don't emit an explicit store pair as this may not always be profitable.
18334 Let the sched-fusion logic decide whether to merge them. */
18335 emit_move_insn (mem_lo, tmp_reg);
18336 emit_move_insn (mem_hi, tmp_reg);
18337
18338 return true;
18339 }
18340
18341 /* Generate RTL for a conditional branch with rtx comparison CODE in
18342 mode CC_MODE. The destination of the unlikely conditional branch
18343 is LABEL_REF. */
18344
18345 void
18346 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18347 rtx label_ref)
18348 {
18349 rtx x;
18350 x = gen_rtx_fmt_ee (code, VOIDmode,
18351 gen_rtx_REG (cc_mode, CC_REGNUM),
18352 const0_rtx);
18353
18354 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18355 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18356 pc_rtx);
18357 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18358 }
18359
18360 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18361
18362 OP1 represents the TImode source operand 1
18363 OP2 represents the TImode source operand 2
18364 LOW_DEST represents the low half (DImode) of TImode operand 0
18365 LOW_IN1 represents the low half (DImode) of TImode operand 1
18366 LOW_IN2 represents the low half (DImode) of TImode operand 2
18367 HIGH_DEST represents the high half (DImode) of TImode operand 0
18368 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18369 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18370
18371 void
18372 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18373 rtx *low_in1, rtx *low_in2,
18374 rtx *high_dest, rtx *high_in1,
18375 rtx *high_in2)
18376 {
18377 *low_dest = gen_reg_rtx (DImode);
18378 *low_in1 = gen_lowpart (DImode, op1);
18379 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18380 subreg_lowpart_offset (DImode, TImode));
18381 *high_dest = gen_reg_rtx (DImode);
18382 *high_in1 = gen_highpart (DImode, op1);
18383 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18384 subreg_highpart_offset (DImode, TImode));
18385 }
18386
18387 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18388
18389 This function differs from 'aarch64_addti_scratch_regs' in that
18390 OP1 can be an immediate constant (zero). We must call
18391 subreg_highpart_offset with DImode and TImode arguments, otherwise
18392 VOIDmode will be used for the const_int, which generates an internal
18393 error from subreg_size_highpart_offset, which does not expect a size of zero.
18394
18395 OP1 represents the TImode source operand 1
18396 OP2 represents the TImode source operand 2
18397 LOW_DEST represents the low half (DImode) of TImode operand 0
18398 LOW_IN1 represents the low half (DImode) of TImode operand 1
18399 LOW_IN2 represents the low half (DImode) of TImode operand 2
18400 HIGH_DEST represents the high half (DImode) of TImode operand 0
18401 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18402 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18403
18404
18405 void
18406 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18407 rtx *low_in1, rtx *low_in2,
18408 rtx *high_dest, rtx *high_in1,
18409 rtx *high_in2)
18410 {
18411 *low_dest = gen_reg_rtx (DImode);
18412 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18413 subreg_lowpart_offset (DImode, TImode));
18414
18415 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18416 subreg_lowpart_offset (DImode, TImode));
18417 *high_dest = gen_reg_rtx (DImode);
18418
18419 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18420 subreg_highpart_offset (DImode, TImode));
18421 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18422 subreg_highpart_offset (DImode, TImode));
18423 }
18424
18425 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18426
18427 OP0 represents the TImode destination operand 0
18428 LOW_DEST represents the low half (DImode) of TImode operand 0
18429 LOW_IN1 represents the low half (DImode) of TImode operand 1
18430 LOW_IN2 represents the low half (DImode) of TImode operand 2
18431 HIGH_DEST represents the high half (DImode) of TImode operand 0
18432 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18433 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18434 UNSIGNED_P is true if the operation is being performed on unsigned
18435 values. */
18436 void
18437 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18438 rtx low_in2, rtx high_dest, rtx high_in1,
18439 rtx high_in2, bool unsigned_p)
18440 {
18441 if (low_in2 == const0_rtx)
18442 {
18443 low_dest = low_in1;
18444 high_in2 = force_reg (DImode, high_in2);
18445 if (unsigned_p)
18446 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18447 else
18448 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18449 }
18450 else
18451 {
18452 if (CONST_INT_P (low_in2))
18453 {
18454 high_in2 = force_reg (DImode, high_in2);
18455 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18456 GEN_INT (-INTVAL (low_in2))));
18457 }
18458 else
18459 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18460
18461 if (unsigned_p)
18462 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18463 else
18464 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18465 }
18466
18467 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18468 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18469
18470 }
18471
18472 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18473
18474 static unsigned HOST_WIDE_INT
18475 aarch64_asan_shadow_offset (void)
18476 {
18477 if (TARGET_ILP32)
18478 return (HOST_WIDE_INT_1 << 29);
18479 else
18480 return (HOST_WIDE_INT_1 << 36);
18481 }
18482
18483 static rtx
18484 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18485 int code, tree treeop0, tree treeop1)
18486 {
18487 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18488 rtx op0, op1;
18489 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18490 insn_code icode;
18491 struct expand_operand ops[4];
18492
18493 start_sequence ();
18494 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18495
18496 op_mode = GET_MODE (op0);
18497 if (op_mode == VOIDmode)
18498 op_mode = GET_MODE (op1);
18499
18500 switch (op_mode)
18501 {
18502 case E_QImode:
18503 case E_HImode:
18504 case E_SImode:
18505 cmp_mode = SImode;
18506 icode = CODE_FOR_cmpsi;
18507 break;
18508
18509 case E_DImode:
18510 cmp_mode = DImode;
18511 icode = CODE_FOR_cmpdi;
18512 break;
18513
18514 case E_SFmode:
18515 cmp_mode = SFmode;
18516 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18517 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18518 break;
18519
18520 case E_DFmode:
18521 cmp_mode = DFmode;
18522 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18523 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18524 break;
18525
18526 default:
18527 end_sequence ();
18528 return NULL_RTX;
18529 }
18530
18531 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18532 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18533 if (!op0 || !op1)
18534 {
18535 end_sequence ();
18536 return NULL_RTX;
18537 }
18538 *prep_seq = get_insns ();
18539 end_sequence ();
18540
18541 create_fixed_operand (&ops[0], op0);
18542 create_fixed_operand (&ops[1], op1);
18543
18544 start_sequence ();
18545 if (!maybe_expand_insn (icode, 2, ops))
18546 {
18547 end_sequence ();
18548 return NULL_RTX;
18549 }
18550 *gen_seq = get_insns ();
18551 end_sequence ();
18552
18553 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18554 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18555 }
18556
18557 static rtx
18558 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18559 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18560 {
18561 rtx op0, op1, target;
18562 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18563 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18564 insn_code icode;
18565 struct expand_operand ops[6];
18566 int aarch64_cond;
18567
18568 push_to_sequence (*prep_seq);
18569 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18570
18571 op_mode = GET_MODE (op0);
18572 if (op_mode == VOIDmode)
18573 op_mode = GET_MODE (op1);
18574
18575 switch (op_mode)
18576 {
18577 case E_QImode:
18578 case E_HImode:
18579 case E_SImode:
18580 cmp_mode = SImode;
18581 icode = CODE_FOR_ccmpsi;
18582 break;
18583
18584 case E_DImode:
18585 cmp_mode = DImode;
18586 icode = CODE_FOR_ccmpdi;
18587 break;
18588
18589 case E_SFmode:
18590 cmp_mode = SFmode;
18591 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18592 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18593 break;
18594
18595 case E_DFmode:
18596 cmp_mode = DFmode;
18597 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18598 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18599 break;
18600
18601 default:
18602 end_sequence ();
18603 return NULL_RTX;
18604 }
18605
18606 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18607 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18608 if (!op0 || !op1)
18609 {
18610 end_sequence ();
18611 return NULL_RTX;
18612 }
18613 *prep_seq = get_insns ();
18614 end_sequence ();
18615
18616 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18617 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18618
18619 if (bit_code != AND)
18620 {
18621 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18622 GET_MODE (XEXP (prev, 0))),
18623 VOIDmode, XEXP (prev, 0), const0_rtx);
18624 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18625 }
18626
18627 create_fixed_operand (&ops[0], XEXP (prev, 0));
18628 create_fixed_operand (&ops[1], target);
18629 create_fixed_operand (&ops[2], op0);
18630 create_fixed_operand (&ops[3], op1);
18631 create_fixed_operand (&ops[4], prev);
18632 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18633
18634 push_to_sequence (*gen_seq);
18635 if (!maybe_expand_insn (icode, 6, ops))
18636 {
18637 end_sequence ();
18638 return NULL_RTX;
18639 }
18640
18641 *gen_seq = get_insns ();
18642 end_sequence ();
18643
18644 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18645 }
18646
18647 #undef TARGET_GEN_CCMP_FIRST
18648 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18649
18650 #undef TARGET_GEN_CCMP_NEXT
18651 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18652
18653 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18654 instruction fusion of some sort. */
18655
18656 static bool
18657 aarch64_macro_fusion_p (void)
18658 {
18659 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18660 }
18661
18662
18663 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18664 should be kept together during scheduling. */
18665
18666 static bool
18667 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18668 {
18669 rtx set_dest;
18670 rtx prev_set = single_set (prev);
18671 rtx curr_set = single_set (curr);
18672 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
18673 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18674
18675 if (!aarch64_macro_fusion_p ())
18676 return false;
18677
18678 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18679 {
18680 /* We are trying to match:
18681 prev (mov) == (set (reg r0) (const_int imm16))
18682 curr (movk) == (set (zero_extract (reg r0)
18683 (const_int 16)
18684 (const_int 16))
18685 (const_int imm16_1)) */
18686
18687 set_dest = SET_DEST (curr_set);
18688
18689 if (GET_CODE (set_dest) == ZERO_EXTRACT
18690 && CONST_INT_P (SET_SRC (curr_set))
18691 && CONST_INT_P (SET_SRC (prev_set))
18692 && CONST_INT_P (XEXP (set_dest, 2))
18693 && INTVAL (XEXP (set_dest, 2)) == 16
18694 && REG_P (XEXP (set_dest, 0))
18695 && REG_P (SET_DEST (prev_set))
18696 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18697 {
18698 return true;
18699 }
18700 }
18701
18702 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18703 {
18704
18705 /* We're trying to match:
18706 prev (adrp) == (set (reg r1)
18707 (high (symbol_ref ("SYM"))))
18708 curr (add) == (set (reg r0)
18709 (lo_sum (reg r1)
18710 (symbol_ref ("SYM"))))
18711 Note that r0 need not necessarily be the same as r1, especially
18712 during pre-regalloc scheduling. */
18713
18714 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18715 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18716 {
18717 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18718 && REG_P (XEXP (SET_SRC (curr_set), 0))
18719 && REGNO (XEXP (SET_SRC (curr_set), 0))
18720 == REGNO (SET_DEST (prev_set))
18721 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18722 XEXP (SET_SRC (curr_set), 1)))
18723 return true;
18724 }
18725 }
18726
18727 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18728 {
18729
18730 /* We're trying to match:
18731 prev (movk) == (set (zero_extract (reg r0)
18732 (const_int 16)
18733 (const_int 32))
18734 (const_int imm16_1))
18735 curr (movk) == (set (zero_extract (reg r0)
18736 (const_int 16)
18737 (const_int 48))
18738 (const_int imm16_2)) */
18739
18740 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18741 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18742 && REG_P (XEXP (SET_DEST (prev_set), 0))
18743 && REG_P (XEXP (SET_DEST (curr_set), 0))
18744 && REGNO (XEXP (SET_DEST (prev_set), 0))
18745 == REGNO (XEXP (SET_DEST (curr_set), 0))
18746 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18747 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18748 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18749 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18750 && CONST_INT_P (SET_SRC (prev_set))
18751 && CONST_INT_P (SET_SRC (curr_set)))
18752 return true;
18753
18754 }
18755 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18756 {
18757 /* We're trying to match:
18758 prev (adrp) == (set (reg r0)
18759 (high (symbol_ref ("SYM"))))
18760 curr (ldr) == (set (reg r1)
18761 (mem (lo_sum (reg r0)
18762 (symbol_ref ("SYM")))))
18763 or
18764 curr (ldr) == (set (reg r1)
18765 (zero_extend (mem
18766 (lo_sum (reg r0)
18767 (symbol_ref ("SYM")))))) */
18768 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18769 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18770 {
18771 rtx curr_src = SET_SRC (curr_set);
18772
18773 if (GET_CODE (curr_src) == ZERO_EXTEND)
18774 curr_src = XEXP (curr_src, 0);
18775
18776 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18777 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18778 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18779 == REGNO (SET_DEST (prev_set))
18780 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18781 XEXP (SET_SRC (prev_set), 0)))
18782 return true;
18783 }
18784 }
18785
18786 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18787 && any_condjump_p (curr))
18788 {
18789 unsigned int condreg1, condreg2;
18790 rtx cc_reg_1;
18791 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18792 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18793
18794 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18795 && prev
18796 && modified_in_p (cc_reg_1, prev))
18797 {
18798 enum attr_type prev_type = get_attr_type (prev);
18799
18800 /* FIXME: this misses some of what ThunderX considers simple arithmetic
18801 instructions. Simple shifts are missed here. */
18802 if (prev_type == TYPE_ALUS_SREG
18803 || prev_type == TYPE_ALUS_IMM
18804 || prev_type == TYPE_LOGICS_REG
18805 || prev_type == TYPE_LOGICS_IMM)
18806 return true;
18807 }
18808 }
18809
18810 if (prev_set
18811 && curr_set
18812 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18813 && any_condjump_p (curr))
18814 {
18815 /* We're trying to match:
18816 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18817 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18818 (const_int 0))
18819 (label_ref ("SYM"))
18820 (pc)) */
18821 if (SET_DEST (curr_set) == (pc_rtx)
18822 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18823 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18824 && REG_P (SET_DEST (prev_set))
18825 && REGNO (SET_DEST (prev_set))
18826 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18827 {
18828 /* Fuse ALU operations followed by conditional branch instruction. */
18829 switch (get_attr_type (prev))
18830 {
18831 case TYPE_ALU_IMM:
18832 case TYPE_ALU_SREG:
18833 case TYPE_ADC_REG:
18834 case TYPE_ADC_IMM:
18835 case TYPE_ADCS_REG:
18836 case TYPE_ADCS_IMM:
18837 case TYPE_LOGIC_REG:
18838 case TYPE_LOGIC_IMM:
18839 case TYPE_CSEL:
18840 case TYPE_ADR:
18841 case TYPE_MOV_IMM:
18842 case TYPE_SHIFT_REG:
18843 case TYPE_SHIFT_IMM:
18844 case TYPE_BFM:
18845 case TYPE_RBIT:
18846 case TYPE_REV:
18847 case TYPE_EXTEND:
18848 return true;
18849
18850 default:;
18851 }
18852 }
18853 }
18854
18855 return false;
18856 }
18857
18858 /* Return true iff the instruction fusion described by OP is enabled. */
18859
18860 bool
18861 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18862 {
18863 return (aarch64_tune_params.fusible_ops & op) != 0;
18864 }
18865
18866 /* If MEM is in the form of [base+offset], extract the two parts
18867 of the address and store them in BASE and OFFSET; otherwise return false
18868 after clearing BASE and OFFSET. */
18869
18870 bool
18871 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18872 {
18873 rtx addr;
18874
18875 gcc_assert (MEM_P (mem));
18876
18877 addr = XEXP (mem, 0);
18878
18879 if (REG_P (addr))
18880 {
18881 *base = addr;
18882 *offset = const0_rtx;
18883 return true;
18884 }
18885
18886 if (GET_CODE (addr) == PLUS
18887 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18888 {
18889 *base = XEXP (addr, 0);
18890 *offset = XEXP (addr, 1);
18891 return true;
18892 }
18893
18894 *base = NULL_RTX;
18895 *offset = NULL_RTX;
18896
18897 return false;
18898 }
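
/* For example, for (mem (plus (reg x1) (const_int 16))) the function
   above sets BASE to the x1 REG and OFFSET to (const_int 16); for a
   bare (mem (reg x1)) it uses (const_int 0) as the offset (illustrative
   example).  */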
18899
18900 /* Types for scheduling fusion. */
18901 enum sched_fusion_type
18902 {
18903 SCHED_FUSION_NONE = 0,
18904 SCHED_FUSION_LD_SIGN_EXTEND,
18905 SCHED_FUSION_LD_ZERO_EXTEND,
18906 SCHED_FUSION_LD,
18907 SCHED_FUSION_ST,
18908 SCHED_FUSION_NUM
18909 };
18910
18911 /* If INSN is a load or store whose address is in the form of [base+offset],
18912 extract the two parts and store them in BASE and OFFSET. Return the
18913 scheduling fusion type of this INSN. */
18914
18915 static enum sched_fusion_type
18916 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18917 {
18918 rtx x, dest, src;
18919 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18920
18921 gcc_assert (INSN_P (insn));
18922 x = PATTERN (insn);
18923 if (GET_CODE (x) != SET)
18924 return SCHED_FUSION_NONE;
18925
18926 src = SET_SRC (x);
18927 dest = SET_DEST (x);
18928
18929 machine_mode dest_mode = GET_MODE (dest);
18930
18931 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18932 return SCHED_FUSION_NONE;
18933
18934 if (GET_CODE (src) == SIGN_EXTEND)
18935 {
18936 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18937 src = XEXP (src, 0);
18938 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18939 return SCHED_FUSION_NONE;
18940 }
18941 else if (GET_CODE (src) == ZERO_EXTEND)
18942 {
18943 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18944 src = XEXP (src, 0);
18945 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18946 return SCHED_FUSION_NONE;
18947 }
18948
18949 if (GET_CODE (src) == MEM && REG_P (dest))
18950 extract_base_offset_in_addr (src, base, offset);
18951 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18952 {
18953 fusion = SCHED_FUSION_ST;
18954 extract_base_offset_in_addr (dest, base, offset);
18955 }
18956 else
18957 return SCHED_FUSION_NONE;
18958
18959 if (*base == NULL_RTX || *offset == NULL_RTX)
18960 fusion = SCHED_FUSION_NONE;
18961
18962 return fusion;
18963 }
18964
18965 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18966
18967 Currently we only support fusing ldr or str instructions, so FUSION_PRI
18968 and PRI are only calculated for these instructions. For other instructions,
18969 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18970 types of instruction fusion can be added by returning different priorities.
18971
18972 It's important that irrelevant instructions get the largest FUSION_PRI. */
18973
18974 static void
18975 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18976 int *fusion_pri, int *pri)
18977 {
18978 int tmp, off_val;
18979 rtx base, offset;
18980 enum sched_fusion_type fusion;
18981
18982 gcc_assert (INSN_P (insn));
18983
18984 tmp = max_pri - 1;
18985 fusion = fusion_load_store (insn, &base, &offset);
18986 if (fusion == SCHED_FUSION_NONE)
18987 {
18988 *pri = tmp;
18989 *fusion_pri = tmp;
18990 return;
18991 }
18992
18993 /* Set FUSION_PRI according to fusion type and base register. */
18994 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18995
18996 /* Calculate PRI. */
18997 tmp /= 2;
18998
18999 /* INSN with smaller offset goes first. */
19000 off_val = (int)(INTVAL (offset));
19001 if (off_val >= 0)
19002 tmp -= (off_val & 0xfffff);
19003 else
19004 tmp += ((- off_val) & 0xfffff);
19005
19006 *pri = tmp;
19007 return;
19008 }
19009
19010 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19011 Adjust priority of sha1h instructions so they are scheduled before
19012 other SHA1 instructions. */
19013
19014 static int
19015 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19016 {
19017 rtx x = PATTERN (insn);
19018
19019 if (GET_CODE (x) == SET)
19020 {
19021 x = SET_SRC (x);
19022
19023 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19024 return priority + 10;
19025 }
19026
19027 return priority;
19028 }
19029
19030 /* Given OPERANDS of consecutive load/store, check if we can merge
19031 them into ldp/stp. LOAD is true if they are load instructions.
19032 MODE is the mode of memory operands. */
19033
19034 bool
19035 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19036 machine_mode mode)
19037 {
19038 HOST_WIDE_INT offval_1, offval_2, msize;
19039 enum reg_class rclass_1, rclass_2;
19040 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19041
19042 if (load)
19043 {
19044 mem_1 = operands[1];
19045 mem_2 = operands[3];
19046 reg_1 = operands[0];
19047 reg_2 = operands[2];
19048 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19049 if (REGNO (reg_1) == REGNO (reg_2))
19050 return false;
19051 }
19052 else
19053 {
19054 mem_1 = operands[0];
19055 mem_2 = operands[2];
19056 reg_1 = operands[1];
19057 reg_2 = operands[3];
19058 }
19059
19060 /* The mems cannot be volatile. */
19061 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19062 return false;
19063
19064 /* If we have SImode and slow unaligned ldp,
19065 check that the alignment is at least 8 bytes. */
19066 if (mode == SImode
19067 && (aarch64_tune_params.extra_tuning_flags
19068 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19069 && !optimize_size
19070 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19071 return false;
19072
19073 /* Check if the addresses are in the form of [base+offset]. */
19074 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19075 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19076 return false;
19077 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19078 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19079 return false;
19080
19081 /* Check if the bases are the same. */
19082 if (!rtx_equal_p (base_1, base_2))
19083 return false;
19084
19085 /* The operands must be of the same size. */
19086 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19087 GET_MODE_SIZE (GET_MODE (mem_2))));
19088
19089 offval_1 = INTVAL (offset_1);
19090 offval_2 = INTVAL (offset_2);
19091 /* We should only be trying this for fixed-sized modes. There is no
19092 SVE LDP/STP instruction. */
19093 msize = GET_MODE_SIZE (mode).to_constant ();
19094 /* Check if the offsets are consecutive. */
19095 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19096 return false;
19097
19098 /* Check if the addresses are clobbered by load. */
19099 if (load)
19100 {
19101 if (reg_mentioned_p (reg_1, mem_1))
19102 return false;
19103
19104 /* In increasing order, the last load can clobber the address. */
19105 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19106 return false;
19107 }
19108
19109 /* One of the memory accesses must be a mempair operand.
19110 If it is not the first one, they need to be swapped by the
19111 peephole. */
19112 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19113 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19114 return false;
19115
19116 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19117 rclass_1 = FP_REGS;
19118 else
19119 rclass_1 = GENERAL_REGS;
19120
19121 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19122 rclass_2 = FP_REGS;
19123 else
19124 rclass_2 = GENERAL_REGS;
19125
19126 /* Check if the registers are of the same class. */
19127 if (rclass_1 != rclass_2)
19128 return false;
19129
19130 return true;
19131 }
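
/* For example, two SImode loads such as

     ldr w0, [x2]
     ldr w1, [x2, 4]

   use the same base, consecutive offsets (0 and 4 with msize == 4) and
   registers of the same class, so the checks above allow them to be
   merged into a single ldp w0, w1, [x2] (illustrative example).  */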
19132
19133 /* Given OPERANDS of consecutive load/store that can be merged,
19134 swap them if they are not in ascending order. */
19135 void
19136 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19137 {
19138 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19139 HOST_WIDE_INT offval_1, offval_2;
19140
19141 if (load)
19142 {
19143 mem_1 = operands[1];
19144 mem_2 = operands[3];
19145 }
19146 else
19147 {
19148 mem_1 = operands[0];
19149 mem_2 = operands[2];
19150 }
19151
19152 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19153 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19154
19155 offval_1 = INTVAL (offset_1);
19156 offval_2 = INTVAL (offset_2);
19157
19158 if (offval_1 > offval_2)
19159 {
19160 /* Irrespective of whether this is a load or a store,
19161 we do the same swap. */
19162 std::swap (operands[0], operands[2]);
19163 std::swap (operands[1], operands[3]);
19164 }
19165 }
19166
19167 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19168 comparison between the two. */
19169 int
19170 aarch64_host_wide_int_compare (const void *x, const void *y)
19171 {
19172 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19173 * ((const HOST_WIDE_INT *) y));
19174 }
19175
19176 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19177 other pointing to a REG rtx containing an offset, compare the offsets
19178 of the two pairs.
19179
19180 Return:
19181
19182 1 iff offset (X) > offset (Y)
19183 0 iff offset (X) == offset (Y)
19184 -1 iff offset (X) < offset (Y) */
19185 int
19186 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19187 {
19188 const rtx * operands_1 = (const rtx *) x;
19189 const rtx * operands_2 = (const rtx *) y;
19190 rtx mem_1, mem_2, base, offset_1, offset_2;
19191
19192 if (MEM_P (operands_1[0]))
19193 mem_1 = operands_1[0];
19194 else
19195 mem_1 = operands_1[1];
19196
19197 if (MEM_P (operands_2[0]))
19198 mem_2 = operands_2[0];
19199 else
19200 mem_2 = operands_2[1];
19201
19202 /* Extract the offsets. */
19203 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19204 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19205
19206 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19207
19208 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19209 }
19210
19211 /* Given OPERANDS of consecutive load/store, check if we can merge
19212 them into ldp/stp by adjusting the offset. LOAD is true if they
19213 are load instructions. MODE is the mode of memory operands.
19214
19215 Given the following consecutive stores:
19216
19217 str w1, [xb, 0x100]
19218 str w1, [xb, 0x104]
19219 str w1, [xb, 0x108]
19220 str w1, [xb, 0x10c]
19221
19222 Though the offsets are out of the range supported by stp, we can
19223 still pair them after adjusting the offset, like:
19224
19225 add scratch, xb, 0x100
19226 stp w1, w1, [scratch]
19227 stp w1, w1, [scratch, 0x8]
19228
19229 The peephole patterns detecting this opportunity should guarantee
19230 the scratch register is available. */
19231
19232 bool
19233 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19234 scalar_mode mode)
19235 {
19236 const int num_insns = 4;
19237 enum reg_class rclass;
19238 HOST_WIDE_INT offvals[num_insns], msize;
19239 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19240
19241 if (load)
19242 {
19243 for (int i = 0; i < num_insns; i++)
19244 {
19245 reg[i] = operands[2 * i];
19246 mem[i] = operands[2 * i + 1];
19247
19248 gcc_assert (REG_P (reg[i]));
19249 }
19250
19251 /* Do not attempt to merge the loads if the loads clobber each other. */
19252 for (int i = 0; i < 8; i += 2)
19253 for (int j = i + 2; j < 8; j += 2)
19254 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19255 return false;
19256 }
19257 else
19258 for (int i = 0; i < num_insns; i++)
19259 {
19260 mem[i] = operands[2 * i];
19261 reg[i] = operands[2 * i + 1];
19262 }
19263
19264 /* Skip if the memory operand is by itself already valid for ldp/stp. */
19265 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19266 return false;
19267
19268 for (int i = 0; i < num_insns; i++)
19269 {
19270 /* The mems cannot be volatile. */
19271 if (MEM_VOLATILE_P (mem[i]))
19272 return false;
19273
19274 /* Check if the addresses are in the form of [base+offset]. */
19275 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19276 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19277 return false;
19278 }
19279
19280 /* Check if the registers are of the same class. */
19281 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19282 ? FP_REGS : GENERAL_REGS;
19283
19284 for (int i = 1; i < num_insns; i++)
19285 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19286 {
19287 if (rclass != FP_REGS)
19288 return false;
19289 }
19290 else
19291 {
19292 if (rclass != GENERAL_REGS)
19293 return false;
19294 }
19295
19296 /* Only the last register in the order in which they occur
19297 may be clobbered by the load. */
19298 if (rclass == GENERAL_REGS && load)
19299 for (int i = 0; i < num_insns - 1; i++)
19300 if (reg_mentioned_p (reg[i], mem[i]))
19301 return false;
19302
19303 /* Check if the bases are the same. */
19304 for (int i = 0; i < num_insns - 1; i++)
19305 if (!rtx_equal_p (base[i], base[i + 1]))
19306 return false;
19307
19308 for (int i = 0; i < num_insns; i++)
19309 offvals[i] = INTVAL (offset[i]);
19310
19311 msize = GET_MODE_SIZE (mode);
19312
19313 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19314 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19315 aarch64_host_wide_int_compare);
19316
19317 if (!(offvals[1] == offvals[0] + msize
19318 && offvals[3] == offvals[2] + msize))
19319 return false;
19320
19321 /* Check that offsets are within range of each other. The ldp/stp
19322 instructions have 7-bit immediate offsets, so use 0x80. */
19323 if (offvals[2] - offvals[0] >= msize * 0x80)
19324 return false;
19325
19326 /* The offsets must be aligned with respect to each other. */
19327 if (offvals[0] % msize != offvals[2] % msize)
19328 return false;
19329
19330 /* If we have SImode and slow unaligned ldp,
19331 check that the alignment is at least 8 bytes. */
19332 if (mode == SImode
19333 && (aarch64_tune_params.extra_tuning_flags
19334 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19335 && !optimize_size
19336 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19337 return false;
19338
19339 return true;
19340 }
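
/* For example, for the four SImode stores in the comment above (offsets
   0x100, 0x104, 0x108 and 0x10c with msize == 4), the sorted offsets
   satisfy offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
   the distance 0x108 - 0x100 == 8 is well below 4 * 0x80, and all the
   offsets share the same alignment, so the checks above succeed
   (illustrative walk-through).  */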
19341
19342 /* Given OPERANDS of consecutive load/store, this function pairs them
19343 into LDP/STP after adjusting the offset. It depends on the fact
19344 that the operands can be sorted so the offsets are correct for STP.
19345 MODE is the mode of memory operands. CODE is the rtl operator
19346 which should be applied to all memory operands, it's SIGN_EXTEND,
19347 ZERO_EXTEND or UNKNOWN. */
19348
19349 bool
19350 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19351 scalar_mode mode, RTX_CODE code)
19352 {
19353 rtx base, offset_1, offset_3, t1, t2;
19354 rtx mem_1, mem_2, mem_3, mem_4;
19355 rtx temp_operands[8];
19356 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19357 stp_off_upper_limit, stp_off_lower_limit, msize;
19358
19359 /* We make changes on a copy as we may still bail out. */
19360 for (int i = 0; i < 8; i ++)
19361 temp_operands[i] = operands[i];
19362
19363 /* Sort the operands. */
19364 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19365
19366 /* Copy the memory operands so that if we have to bail for some
19367 reason the original addresses are unchanged. */
19368 if (load)
19369 {
19370 mem_1 = copy_rtx (temp_operands[1]);
19371 mem_2 = copy_rtx (temp_operands[3]);
19372 mem_3 = copy_rtx (temp_operands[5]);
19373 mem_4 = copy_rtx (temp_operands[7]);
19374 }
19375 else
19376 {
19377 mem_1 = copy_rtx (temp_operands[0]);
19378 mem_2 = copy_rtx (temp_operands[2]);
19379 mem_3 = copy_rtx (temp_operands[4]);
19380 mem_4 = copy_rtx (temp_operands[6]);
19381 gcc_assert (code == UNKNOWN);
19382 }
19383
19384 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19385 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19386 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19387 && offset_3 != NULL_RTX);
19388
19389 /* Adjust the offset so it can fit in an LDP/STP instruction. */
19390 msize = GET_MODE_SIZE (mode);
19391 stp_off_upper_limit = msize * (0x40 - 1);
19392 stp_off_lower_limit = - msize * 0x40;
19393
19394 off_val_1 = INTVAL (offset_1);
19395 off_val_3 = INTVAL (offset_3);
19396
19397 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19398 if (msize <= 4)
19399 base_off = (off_val_1 + off_val_3) / 2;
19400 else
19401 /* However, due to issues with negative LDP/STP offset generation for
19402 larger modes (DF, DI and vector modes), we must not use negative
19403 addresses smaller than 9 signed unadjusted bits can store. This
19404 provides the most range in this case. */
19405 base_off = off_val_1;
19406
19407 /* Adjust the base so that it is aligned with the addresses but still
19408 optimal. */
19409 if (base_off % msize != off_val_1 % msize)
19410 /* Fix the offset, bearing in mind we want to make it bigger not
19411 smaller. */
19412 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19413 else if (msize <= 4)
19414 /* The negative range of LDP/STP is one larger than the positive range. */
19415 base_off += msize;
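/* Worked example (added for illustration, assuming SImode accesses at
   offsets 300 and 700 from the base): msize == 4, so base_off starts as
   (300 + 700) / 2 == 500.  It is already aligned (500 % 4 == 300 % 4),
   so the msize <= 4 case above bumps it to 504 to favour the larger
   negative range.  The adjusted offsets become 300 - 504 == -204 and
   700 - 504 == 196, both inside the SImode STP range [-256, 252].  */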
19416
19417 /* Check if the base offset is too big or too small. We can attempt to resolve
19418 this issue by setting it to the maximum value and seeing if the offsets
19419 still fit. */
19420 if (base_off >= 0x1000)
19421 {
19422 base_off = 0x1000 - 1;
19423 /* We must still make sure that the base offset is aligned with respect
19424 to the address. But it may not be made any bigger. */
19425 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19426 }
19427
19428 /* Likewise for the case where the base is too small. */
19429 if (base_off <= -0x1000)
19430 {
19431 base_off = -0x1000 + 1;
19432 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19433 }
19434
19435 /* Offset of the first STP/LDP. */
19436 new_off_1 = off_val_1 - base_off;
19437
19438 /* Offset of the second STP/LDP. */
19439 new_off_3 = off_val_3 - base_off;
19440
19441 /* The offsets must be within the range of the LDP/STP instructions. */
19442 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19443 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19444 return false;
19445
19446 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19447 new_off_1), true);
19448 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19449 new_off_1 + msize), true);
19450 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19451 new_off_3), true);
19452 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19453 new_off_3 + msize), true);
19454
19455 if (!aarch64_mem_pair_operand (mem_1, mode)
19456 || !aarch64_mem_pair_operand (mem_3, mode))
19457 return false;
19458
19459 if (code == ZERO_EXTEND)
19460 {
19461 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19462 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19463 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19464 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19465 }
19466 else if (code == SIGN_EXTEND)
19467 {
19468 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19469 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19470 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19471 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19472 }
19473
19474 if (load)
19475 {
19476 operands[0] = temp_operands[0];
19477 operands[1] = mem_1;
19478 operands[2] = temp_operands[2];
19479 operands[3] = mem_2;
19480 operands[4] = temp_operands[4];
19481 operands[5] = mem_3;
19482 operands[6] = temp_operands[6];
19483 operands[7] = mem_4;
19484 }
19485 else
19486 {
19487 operands[0] = mem_1;
19488 operands[1] = temp_operands[1];
19489 operands[2] = mem_2;
19490 operands[3] = temp_operands[3];
19491 operands[4] = mem_3;
19492 operands[5] = temp_operands[5];
19493 operands[6] = mem_4;
19494 operands[7] = temp_operands[7];
19495 }
19496
19497 /* Emit the adjusting instruction. */
19498 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19499 /* Emit the ldp/stp instructions. */
19500 t1 = gen_rtx_SET (operands[0], operands[1]);
19501 t2 = gen_rtx_SET (operands[2], operands[3]);
19502 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19503 t1 = gen_rtx_SET (operands[4], operands[5]);
19504 t2 = gen_rtx_SET (operands[6], operands[7]);
19505 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19506 return true;
19507 }
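/* Sketch of the overall transformation (added illustration; register
   numbers are arbitrary and the exact assembly depends on the pattern
   that invokes this function).  Continuing the worked example above,
   four SImode stores at offsets 300, 304, 700 and 704 from x2, none of
   which is a valid STP address by itself, become an adjusting add into
   the scratch base (operands[8], say x5) followed by two STPs:

       add x5, x2, #504
       stp w0, w1, [x5, #-204]
       stp w3, w4, [x5, #196]   */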
19508
19509 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19510 it isn't worth branching around empty masked ops (including masked
19511 stores). */
19512
19513 static bool
19514 aarch64_empty_mask_is_expensive (unsigned)
19515 {
19516 return false;
19517 }
19518
19519 /* Return true if a pseudo register should be created and used to hold
19520 the GOT address for PIC code. */
19521
19522 bool
19523 aarch64_use_pseudo_pic_reg (void)
19524 {
19525 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19526 }
19527
19528 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19529
19530 static int
19531 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19532 {
19533 switch (XINT (x, 1))
19534 {
19535 case UNSPEC_GOTSMALLPIC:
19536 case UNSPEC_GOTSMALLPIC28K:
19537 case UNSPEC_GOTTINYPIC:
19538 return 0;
19539 default:
19540 break;
19541 }
19542
19543 return default_unspec_may_trap_p (x, flags);
19544 }
19545
19546
19547 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19548 return the log2 of that value. Otherwise return -1. */
19549
19550 int
19551 aarch64_fpconst_pow_of_2 (rtx x)
19552 {
19553 const REAL_VALUE_TYPE *r;
19554
19555 if (!CONST_DOUBLE_P (x))
19556 return -1;
19557
19558 r = CONST_DOUBLE_REAL_VALUE (x);
19559
19560 if (REAL_VALUE_NEGATIVE (*r)
19561 || REAL_VALUE_ISNAN (*r)
19562 || REAL_VALUE_ISINF (*r)
19563 || !real_isinteger (r, DFmode))
19564 return -1;
19565
19566 return exact_log2 (real_to_integer (r));
19567 }
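/* Examples (added for illustration): 8.0 yields 3 and 1.0 yields 0,
   while 3.0 (not a power of 2), 0.5 (not an integer value), -4.0
   (negative) and NaN or infinity all yield -1.  */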
19568
19569 /* If X is a vector of equal CONST_DOUBLE values and that value is
19570 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19571
19572 int
19573 aarch64_vec_fpconst_pow_of_2 (rtx x)
19574 {
19575 int nelts;
19576 if (GET_CODE (x) != CONST_VECTOR
19577 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19578 return -1;
19579
19580 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19581 return -1;
19582
19583 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19584 if (firstval <= 0)
19585 return -1;
19586
19587 for (int i = 1; i < nelts; i++)
19588 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19589 return -1;
19590
19591 return firstval;
19592 }
19593
19594 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19595 to float.
19596
19597 __fp16 always promotes through this hook.
19598 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19599 through the generic excess precision logic rather than here. */
19600
19601 static tree
19602 aarch64_promoted_type (const_tree t)
19603 {
19604 if (SCALAR_FLOAT_TYPE_P (t)
19605 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19606 return float_type_node;
19607
19608 return NULL_TREE;
19609 }
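/* Illustrative user-level code (not part of this file):

     __fp16 a, b;
     float f = a + b;   // a and b promote to float; the addition is
                        // performed in single precision

   _Float16 operands, by contrast, are handled by the excess-precision
   logic in aarch64_excess_precision below.  */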
19610
19611 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19612
19613 static bool
19614 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19615 optimization_type opt_type)
19616 {
19617 switch (op)
19618 {
19619 case rsqrt_optab:
19620 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19621
19622 default:
19623 return true;
19624 }
19625 }
19626
19627 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19628
19629 static unsigned int
19630 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19631 int *offset)
19632 {
19633 /* Polynomial invariant 1 == (VG / 2) - 1. */
19634 gcc_assert (i == 1);
19635 *factor = 2;
19636 *offset = 1;
19637 return AARCH64_DWARF_VG;
19638 }
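/* Example (added for illustration): at run time the VG register holds
   the vector length in 64-bit granules, so on a 256-bit SVE
   implementation VG == 4 and the indeterminate evaluates to
   (4 / 2) - 1 == 1, while on the minimum 128-bit implementation it
   evaluates to 0.  This matches the poly_int convention of counting
   128-bit chunks beyond the first.  */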
19639
19640 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19641 if MODE is HFmode, and punt to the generic implementation otherwise. */
19642
19643 static bool
19644 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19645 {
19646 return (mode == HFmode
19647 ? true
19648 : default_libgcc_floating_mode_supported_p (mode));
19649 }
19650
19651 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19652 if MODE is HFmode, and punt to the generic implementation otherwise. */
19653
19654 static bool
19655 aarch64_scalar_mode_supported_p (scalar_mode mode)
19656 {
19657 return (mode == HFmode
19658 ? true
19659 : default_scalar_mode_supported_p (mode));
19660 }
19661
19662 /* Set the value of FLT_EVAL_METHOD.
19663 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19664
19665 0: evaluate all operations and constants, whose semantic type has at
19666 most the range and precision of type float, to the range and
19667 precision of float; evaluate all other operations and constants to
19668 the range and precision of the semantic type;
19669
19670 N, where _FloatN is a supported interchange floating type:
19671 evaluate all operations and constants, whose semantic type has at
19672 most the range and precision of _FloatN type, to the range and
19673 precision of the _FloatN type; evaluate all other operations and
19674 constants to the range and precision of the semantic type;
19675
19676 If we have the ARMv8.2-A extensions then we support _Float16 in native
19677 precision, so we should set this to 16. Otherwise, we support the type,
19678 but want to evaluate expressions in float precision, so set this to
19679 0. */
19680
19681 static enum flt_eval_method
19682 aarch64_excess_precision (enum excess_precision_type type)
19683 {
19684 switch (type)
19685 {
19686 case EXCESS_PRECISION_TYPE_FAST:
19687 case EXCESS_PRECISION_TYPE_STANDARD:
19688 /* We can calculate either in 16-bit range and precision or
19689 32-bit range and precision. Make that decision based on whether
19690 we have native support for the ARMv8.2-A 16-bit floating-point
19691 instructions or not. */
19692 return (TARGET_FP_F16INST
19693 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19694 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19695 case EXCESS_PRECISION_TYPE_IMPLICIT:
19696 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19697 default:
19698 gcc_unreachable ();
19699 }
19700 return FLT_EVAL_METHOD_UNPREDICTABLE;
19701 }
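/* Illustrative consequence for user-level code (not part of this file):

     _Float16 x, y, z;
     z = x * y + z;

   with the ARMv8.2-A half-precision instructions (TARGET_FP_F16INST)
   the arithmetic may be performed directly in _Float16
   (FLT_EVAL_METHOD == 16); without them the operands are converted to
   float, the arithmetic is done in single precision and the result is
   converted back to _Float16.  */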
19702
19703 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19704 scheduled for speculative execution. Reject the long-running division
19705 and square-root instructions. */
19706
19707 static bool
19708 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19709 {
19710 switch (get_attr_type (insn))
19711 {
19712 case TYPE_SDIV:
19713 case TYPE_UDIV:
19714 case TYPE_FDIVS:
19715 case TYPE_FDIVD:
19716 case TYPE_FSQRTS:
19717 case TYPE_FSQRTD:
19718 case TYPE_NEON_FP_SQRT_S:
19719 case TYPE_NEON_FP_SQRT_D:
19720 case TYPE_NEON_FP_SQRT_S_Q:
19721 case TYPE_NEON_FP_SQRT_D_Q:
19722 case TYPE_NEON_FP_DIV_S:
19723 case TYPE_NEON_FP_DIV_D:
19724 case TYPE_NEON_FP_DIV_S_Q:
19725 case TYPE_NEON_FP_DIV_D_Q:
19726 return false;
19727 default:
19728 return true;
19729 }
19730 }
19731
19732 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19733
19734 static int
19735 aarch64_compute_pressure_classes (reg_class *classes)
19736 {
19737 int i = 0;
19738 classes[i++] = GENERAL_REGS;
19739 classes[i++] = FP_REGS;
19740 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19741 registers need to go in PR_LO_REGS at some point during their
19742 lifetime. Splitting it into two halves has the effect of making
19743 all predicates count against PR_LO_REGS, so that we try whenever
19744 possible to restrict the number of live predicates to 8. This
19745 greatly reduces the amount of spilling in certain loops. */
19746 classes[i++] = PR_LO_REGS;
19747 classes[i++] = PR_HI_REGS;
19748 return i;
19749 }
19750
19751 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19752
19753 static bool
19754 aarch64_can_change_mode_class (machine_mode from,
19755 machine_mode to, reg_class_t)
19756 {
19757 if (BYTES_BIG_ENDIAN)
19758 {
19759 bool from_sve_p = aarch64_sve_data_mode_p (from);
19760 bool to_sve_p = aarch64_sve_data_mode_p (to);
19761
19762 /* Don't allow changes between SVE data modes and non-SVE modes.
19763 See the comment at the head of aarch64-sve.md for details. */
19764 if (from_sve_p != to_sve_p)
19765 return false;
19766
19767 /* Don't allow changes in element size: lane 0 of the new vector
19768 would not then be lane 0 of the old vector. See the comment
19769 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19770 description.
19771
19772 In the worst case, this forces a register to be spilled in
19773 one mode and reloaded in the other, which handles the
19774 endianness correctly. */
19775 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19776 return false;
19777 }
19778 return true;
19779 }
19780
19781 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
19782
19783 static void
19784 aarch64_select_early_remat_modes (sbitmap modes)
19785 {
19786 /* SVE values are not normally live across a call, so it should be
19787 worth doing early rematerialization even in VL-specific mode. */
19788 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19789 {
19790 machine_mode mode = (machine_mode) i;
19791 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19792 if (vec_flags & VEC_ANY_SVE)
19793 bitmap_set_bit (modes, i);
19794 }
19795 }
19796
19797 /* Override the default target speculation_safe_value. */
19798 static rtx
19799 aarch64_speculation_safe_value (machine_mode mode,
19800 rtx result, rtx val, rtx failval)
19801 {
19802 /* Maybe we should warn if falling back to hard barriers. They are
19803 likely to be noticeably more expensive than the alternative below. */
19804 if (!aarch64_track_speculation)
19805 return default_speculation_safe_value (mode, result, val, failval);
19806
19807 if (!REG_P (val))
19808 val = copy_to_mode_reg (mode, val);
19809
19810 if (!aarch64_reg_or_zero (failval, mode))
19811 failval = copy_to_mode_reg (mode, failval);
19812
19813 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19814 return result;
19815 }
19816
19817 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19818 Look into the tuning structure for an estimate.
19819 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19820 Advanced SIMD 128 bits. */
19821
19822 static HOST_WIDE_INT
19823 aarch64_estimated_poly_value (poly_int64 val)
19824 {
19825 enum aarch64_sve_vector_bits_enum width_source
19826 = aarch64_tune_params.sve_width;
19827
19828 /* If we still don't have an estimate, use the default. */
19829 if (width_source == SVE_SCALABLE)
19830 return default_estimated_poly_value (val);
19831
19832 HOST_WIDE_INT over_128 = width_source - 128;
19833 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19834 }
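/* Worked example (added for illustration): the number of bytes in an
   SVE vector is the poly_int 16 + 16x, where x counts 128-bit chunks
   beyond the first.  If the tuning sets sve_width to 256 bits,
   over_128 is 128 and the estimate is 16 + 16 * 128 / 128 == 32 bytes,
   i.e. a 256-bit vector.  */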
19835
19836
19837 /* Return true for types that could be supported as SIMD return or
19838 argument types. */
19839
19840 static bool
19841 supported_simd_type (tree t)
19842 {
19843 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19844 {
19845 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19846 return s == 1 || s == 2 || s == 4 || s == 8;
19847 }
19848 return false;
19849 }
19850
19851 /* Return true for types that currently are supported as SIMD return
19852 or argument types. */
19853
19854 static bool
19855 currently_supported_simd_type (tree t, tree b)
19856 {
19857 if (COMPLEX_FLOAT_TYPE_P (t))
19858 return false;
19859
19860 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19861 return false;
19862
19863 return supported_simd_type (t);
19864 }
19865
19866 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19867
19868 static int
19869 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19870 struct cgraph_simd_clone *clonei,
19871 tree base_type, int num)
19872 {
19873 tree t, ret_type, arg_type;
19874 unsigned int elt_bits, vec_bits, count;
19875
19876 if (!TARGET_SIMD)
19877 return 0;
19878
19879 if (clonei->simdlen
19880 && (clonei->simdlen < 2
19881 || clonei->simdlen > 1024
19882 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19883 {
19884 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19885 "unsupported simdlen %d", clonei->simdlen);
19886 return 0;
19887 }
19888
19889 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19890 if (TREE_CODE (ret_type) != VOID_TYPE
19891 && !currently_supported_simd_type (ret_type, base_type))
19892 {
19893 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19894 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19895 "GCC does not currently support mixed size types "
19896 "for %<simd%> functions");
19897 else if (supported_simd_type (ret_type))
19898 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19899 "GCC does not currently support return type %qT "
19900 "for %<simd%> functions", ret_type);
19901 else
19902 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19903 "unsupported return type %qT for %<simd%> functions",
19904 ret_type);
19905 return 0;
19906 }
19907
19908 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19909 {
19910 arg_type = TREE_TYPE (t);
19911
19912 if (!currently_supported_simd_type (arg_type, base_type))
19913 {
19914 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19915 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19916 "GCC does not currently support mixed size types "
19917 "for %<simd%> functions");
19918 else
19919 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19920 "GCC does not currently support argument type %qT "
19921 "for %<simd%> functions", arg_type);
19922 return 0;
19923 }
19924 }
19925
19926 clonei->vecsize_mangle = 'n';
19927 clonei->mask_mode = VOIDmode;
19928 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19929 if (clonei->simdlen == 0)
19930 {
19931 count = 2;
19932 vec_bits = (num == 0 ? 64 : 128);
19933 clonei->simdlen = vec_bits / elt_bits;
19934 }
19935 else
19936 {
19937 count = 1;
19938 vec_bits = clonei->simdlen * elt_bits;
19939 if (vec_bits != 64 && vec_bits != 128)
19940 {
19941 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19942 "GCC does not currently support simdlen %d for type %qT",
19943 clonei->simdlen, base_type);
19944 return 0;
19945 }
19946 }
19947 clonei->vecsize_int = vec_bits;
19948 clonei->vecsize_float = vec_bits;
19949 return count;
19950 }
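/* Example of the resulting clones (added for illustration):

     #pragma omp declare simd
     float f (float x);

   Here base_type is float (elt_bits == 32) and no simdlen is given, so
   two clones are advertised: num == 0 gives vec_bits == 64 and
   simdlen 2, num == 1 gives vec_bits == 128 and simdlen 4.  An explicit
   simdlen (4) clause instead produces a single 128-bit clone.  */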
19951
19952 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19953
19954 static void
19955 aarch64_simd_clone_adjust (struct cgraph_node *node)
19956 {
19957 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19958 use the correct ABI. */
19959
19960 tree t = TREE_TYPE (node->decl);
19961 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19962 TYPE_ATTRIBUTES (t));
19963 }
19964
19965 /* Implement TARGET_SIMD_CLONE_USABLE. */
19966
19967 static int
19968 aarch64_simd_clone_usable (struct cgraph_node *node)
19969 {
19970 switch (node->simdclone->vecsize_mangle)
19971 {
19972 case 'n':
19973 if (!TARGET_SIMD)
19974 return -1;
19975 return 0;
19976 default:
19977 gcc_unreachable ();
19978 }
19979 }
19980
19981 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19982
19983 static int
19984 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19985 {
19986 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19987 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19988 return 0;
19989 return 1;
19990 }
19991
19992 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19993
19994 static const char *
19995 aarch64_get_multilib_abi_name (void)
19996 {
19997 if (TARGET_BIG_END)
19998 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19999 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20000 }
20001
20002 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20003 global variable based guard, use the default; otherwise
20004 return a null tree. */
20005 static tree
20006 aarch64_stack_protect_guard (void)
20007 {
20008 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20009 return default_stack_protect_guard ();
20010
20011 return NULL_TREE;
20012 }
20013
20014 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20015 section at the end if needed. */
20016 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20017 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20018 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20019 void
20020 aarch64_file_end_indicate_exec_stack ()
20021 {
20022 file_end_indicate_exec_stack ();
20023
20024 unsigned feature_1_and = 0;
20025 if (aarch64_bti_enabled ())
20026 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20027
20028 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20029 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20030
20031 if (feature_1_and)
20032 {
20033 /* Generate .note.gnu.property section. */
20034 switch_to_section (get_section (".note.gnu.property",
20035 SECTION_NOTYPE, NULL));
20036
20037 /* PT_NOTE header: namesz, descsz, type.
20038 namesz = 4 ("GNU\0")
20039 descsz = 16 (Size of the program property array)
20040 [(12 + padding) * Number of array elements]
20041 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20042 assemble_align (POINTER_SIZE);
20043 assemble_integer (GEN_INT (4), 4, 32, 1);
20044 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20045 assemble_integer (GEN_INT (5), 4, 32, 1);
20046
20047 /* PT_NOTE name. */
20048 assemble_string ("GNU", 4);
20049
20050 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20051 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20052 datasz = 4
20053 data = feature_1_and. */
20054 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20055 assemble_integer (GEN_INT (4), 4, 32, 1);
20056 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20057
20058 /* Pad the size of the note to the required alignment. */
20059 assemble_align (POINTER_SIZE);
20060 }
20061 }
20062 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20063 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20064 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
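/* Rough sketch of the emitted note (added illustration; the exact
   directives depend on the configured assembler, and 64-bit pointers
   are assumed).  With both BTI and PAC enabled, feature_1_and == 3 and
   the output resembles:

       .section .note.gnu.property,"a"
       .p2align 3
       .word 4            // namesz ("GNU\0")
       .word 16           // descsz (12 rounded up to 8 bytes)
       .word 5            // NT_GNU_PROPERTY_TYPE_0
       .string "GNU"
       .word 0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word 4            // datasz
       .word 3            // BTI | PAC
       .p2align 3  */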
20065
20066 /* Target-specific selftests. */
20067
20068 #if CHECKING_P
20069
20070 namespace selftest {
20071
20072 /* Selftest for the RTL loader.
20073 Verify that the RTL loader copes with a dump from
20074 print_rtx_function. This is essentially just a test that class
20075 function_reader can handle a real dump, but it also verifies
20076 that lookup_reg_by_dump_name correctly handles hard regs.
20077 The presence of hard reg names in the dump means that the test is
20078 target-specific, hence it is in this file. */
20079
20080 static void
20081 aarch64_test_loading_full_dump ()
20082 {
20083 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20084
20085 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20086
20087 rtx_insn *insn_1 = get_insn_by_uid (1);
20088 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20089
20090 rtx_insn *insn_15 = get_insn_by_uid (15);
20091 ASSERT_EQ (INSN, GET_CODE (insn_15));
20092 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20093
20094 /* Verify crtl->return_rtx. */
20095 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20096 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20097 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20098 }
20099
20100 /* Run all target-specific selftests. */
20101
20102 static void
20103 aarch64_run_selftests (void)
20104 {
20105 aarch64_test_loading_full_dump ();
20106 }
20107
20108 } // namespace selftest
20109
20110 #endif /* #if CHECKING_P */
20111
20112 #undef TARGET_STACK_PROTECT_GUARD
20113 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20114
20115 #undef TARGET_ADDRESS_COST
20116 #define TARGET_ADDRESS_COST aarch64_address_cost
20117
20118 /* This hook determines whether unnamed bitfields affect the alignment
20119 of the containing structure. The hook returns true if the structure
20120 should inherit the alignment requirements of an unnamed bitfield's
20121 type. */
20122 #undef TARGET_ALIGN_ANON_BITFIELD
20123 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20124
20125 #undef TARGET_ASM_ALIGNED_DI_OP
20126 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20127
20128 #undef TARGET_ASM_ALIGNED_HI_OP
20129 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20130
20131 #undef TARGET_ASM_ALIGNED_SI_OP
20132 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20133
20134 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20135 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20136 hook_bool_const_tree_hwi_hwi_const_tree_true
20137
20138 #undef TARGET_ASM_FILE_START
20139 #define TARGET_ASM_FILE_START aarch64_start_file
20140
20141 #undef TARGET_ASM_OUTPUT_MI_THUNK
20142 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20143
20144 #undef TARGET_ASM_SELECT_RTX_SECTION
20145 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20146
20147 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20148 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20149
20150 #undef TARGET_BUILD_BUILTIN_VA_LIST
20151 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20152
20153 #undef TARGET_CALLEE_COPIES
20154 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20155
20156 #undef TARGET_CAN_ELIMINATE
20157 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20158
20159 #undef TARGET_CAN_INLINE_P
20160 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20161
20162 #undef TARGET_CANNOT_FORCE_CONST_MEM
20163 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20164
20165 #undef TARGET_CASE_VALUES_THRESHOLD
20166 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20167
20168 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20169 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20170
20171 /* Only the least significant bit is used for initialization guard
20172 variables. */
20173 #undef TARGET_CXX_GUARD_MASK_BIT
20174 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20175
20176 #undef TARGET_C_MODE_FOR_SUFFIX
20177 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20178
20179 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20180 #undef TARGET_DEFAULT_TARGET_FLAGS
20181 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20182 #endif
20183
20184 #undef TARGET_CLASS_MAX_NREGS
20185 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20186
20187 #undef TARGET_BUILTIN_DECL
20188 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20189
20190 #undef TARGET_BUILTIN_RECIPROCAL
20191 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20192
20193 #undef TARGET_C_EXCESS_PRECISION
20194 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20195
20196 #undef TARGET_EXPAND_BUILTIN
20197 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20198
20199 #undef TARGET_EXPAND_BUILTIN_VA_START
20200 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20201
20202 #undef TARGET_FOLD_BUILTIN
20203 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20204
20205 #undef TARGET_FUNCTION_ARG
20206 #define TARGET_FUNCTION_ARG aarch64_function_arg
20207
20208 #undef TARGET_FUNCTION_ARG_ADVANCE
20209 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20210
20211 #undef TARGET_FUNCTION_ARG_BOUNDARY
20212 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20213
20214 #undef TARGET_FUNCTION_ARG_PADDING
20215 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20216
20217 #undef TARGET_GET_RAW_RESULT_MODE
20218 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20219 #undef TARGET_GET_RAW_ARG_MODE
20220 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20221
20222 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20223 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20224
20225 #undef TARGET_FUNCTION_VALUE
20226 #define TARGET_FUNCTION_VALUE aarch64_function_value
20227
20228 #undef TARGET_FUNCTION_VALUE_REGNO_P
20229 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20230
20231 #undef TARGET_GIMPLE_FOLD_BUILTIN
20232 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20233
20234 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20235 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20236
20237 #undef TARGET_INIT_BUILTINS
20238 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20239
20240 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20241 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20242 aarch64_ira_change_pseudo_allocno_class
20243
20244 #undef TARGET_LEGITIMATE_ADDRESS_P
20245 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20246
20247 #undef TARGET_LEGITIMATE_CONSTANT_P
20248 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20249
20250 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20251 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20252 aarch64_legitimize_address_displacement
20253
20254 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20255 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20256
20257 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20258 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20259 aarch64_libgcc_floating_mode_supported_p
20260
20261 #undef TARGET_MANGLE_TYPE
20262 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20263
20264 #undef TARGET_MEMORY_MOVE_COST
20265 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20266
20267 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20268 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20269
20270 #undef TARGET_MUST_PASS_IN_STACK
20271 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20272
20273 /* This target hook should return true if accesses to volatile bitfields
20274 should use the narrowest mode possible. It should return false if these
20275 accesses should use the bitfield container type. */
20276 #undef TARGET_NARROW_VOLATILE_BITFIELD
20277 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20278
20279 #undef TARGET_OPTION_OVERRIDE
20280 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20281
20282 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20283 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20284 aarch64_override_options_after_change
20285
20286 #undef TARGET_OPTION_SAVE
20287 #define TARGET_OPTION_SAVE aarch64_option_save
20288
20289 #undef TARGET_OPTION_RESTORE
20290 #define TARGET_OPTION_RESTORE aarch64_option_restore
20291
20292 #undef TARGET_OPTION_PRINT
20293 #define TARGET_OPTION_PRINT aarch64_option_print
20294
20295 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20296 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20297
20298 #undef TARGET_SET_CURRENT_FUNCTION
20299 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20300
20301 #undef TARGET_PASS_BY_REFERENCE
20302 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20303
20304 #undef TARGET_PREFERRED_RELOAD_CLASS
20305 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20306
20307 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20308 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20309
20310 #undef TARGET_PROMOTED_TYPE
20311 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20312
20313 #undef TARGET_SECONDARY_RELOAD
20314 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20315
20316 #undef TARGET_SHIFT_TRUNCATION_MASK
20317 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20318
20319 #undef TARGET_SETUP_INCOMING_VARARGS
20320 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20321
20322 #undef TARGET_STRUCT_VALUE_RTX
20323 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20324
20325 #undef TARGET_REGISTER_MOVE_COST
20326 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20327
20328 #undef TARGET_RETURN_IN_MEMORY
20329 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20330
20331 #undef TARGET_RETURN_IN_MSB
20332 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20333
20334 #undef TARGET_RTX_COSTS
20335 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20336
20337 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20338 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20339
20340 #undef TARGET_SCHED_ISSUE_RATE
20341 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20342
20343 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20344 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20345 aarch64_sched_first_cycle_multipass_dfa_lookahead
20346
20347 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20348 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20349 aarch64_first_cycle_multipass_dfa_lookahead_guard
20350
20351 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20352 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20353 aarch64_get_separate_components
20354
20355 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20356 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20357 aarch64_components_for_bb
20358
20359 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20360 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20361 aarch64_disqualify_components
20362
20363 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20364 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20365 aarch64_emit_prologue_components
20366
20367 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20368 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20369 aarch64_emit_epilogue_components
20370
20371 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20372 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20373 aarch64_set_handled_components
20374
20375 #undef TARGET_TRAMPOLINE_INIT
20376 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20377
20378 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20379 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20380
20381 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20382 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20383
20384 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20385 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20386 aarch64_builtin_support_vector_misalignment
20387
20388 #undef TARGET_ARRAY_MODE
20389 #define TARGET_ARRAY_MODE aarch64_array_mode
20390
20391 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20392 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20393
20394 #undef TARGET_VECTORIZE_ADD_STMT_COST
20395 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20396
20397 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20398 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20399 aarch64_builtin_vectorization_cost
20400
20401 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20402 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20403
20404 #undef TARGET_VECTORIZE_BUILTINS
20405 #define TARGET_VECTORIZE_BUILTINS
20406
20407 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20408 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20409 aarch64_builtin_vectorized_function
20410
20411 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20412 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20413 aarch64_autovectorize_vector_sizes
20414
20415 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20416 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20417 aarch64_atomic_assign_expand_fenv
20418
20419 /* Section anchor support. */
20420
20421 #undef TARGET_MIN_ANCHOR_OFFSET
20422 #define TARGET_MIN_ANCHOR_OFFSET -256
20423
20424 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20425 byte offset; we can do much more for larger data types, but have no way
20426 to determine the size of the access. We assume accesses are aligned. */
20427 #undef TARGET_MAX_ANCHOR_OFFSET
20428 #define TARGET_MAX_ANCHOR_OFFSET 4095
20429
20430 #undef TARGET_VECTOR_ALIGNMENT
20431 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20432
20433 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20434 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20435 aarch64_vectorize_preferred_vector_alignment
20436 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20437 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20438 aarch64_simd_vector_alignment_reachable
20439
20440 /* vec_perm support. */
20441
20442 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20443 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20444 aarch64_vectorize_vec_perm_const
20445
20446 #undef TARGET_VECTORIZE_GET_MASK_MODE
20447 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20448 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20449 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20450 aarch64_empty_mask_is_expensive
20451 #undef TARGET_PREFERRED_ELSE_VALUE
20452 #define TARGET_PREFERRED_ELSE_VALUE \
20453 aarch64_preferred_else_value
20454
20455 #undef TARGET_INIT_LIBFUNCS
20456 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20457
20458 #undef TARGET_FIXED_CONDITION_CODE_REGS
20459 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20460
20461 #undef TARGET_FLAGS_REGNUM
20462 #define TARGET_FLAGS_REGNUM CC_REGNUM
20463
20464 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20465 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20466
20467 #undef TARGET_ASAN_SHADOW_OFFSET
20468 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20469
20470 #undef TARGET_LEGITIMIZE_ADDRESS
20471 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20472
20473 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20474 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20475
20476 #undef TARGET_CAN_USE_DOLOOP_P
20477 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20478
20479 #undef TARGET_SCHED_ADJUST_PRIORITY
20480 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20481
20482 #undef TARGET_SCHED_MACRO_FUSION_P
20483 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20484
20485 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20486 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20487
20488 #undef TARGET_SCHED_FUSION_PRIORITY
20489 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20490
20491 #undef TARGET_UNSPEC_MAY_TRAP_P
20492 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20493
20494 #undef TARGET_USE_PSEUDO_PIC_REG
20495 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20496
20497 #undef TARGET_PRINT_OPERAND
20498 #define TARGET_PRINT_OPERAND aarch64_print_operand
20499
20500 #undef TARGET_PRINT_OPERAND_ADDRESS
20501 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20502
20503 #undef TARGET_OPTAB_SUPPORTED_P
20504 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20505
20506 #undef TARGET_OMIT_STRUCT_RETURN_REG
20507 #define TARGET_OMIT_STRUCT_RETURN_REG true
20508
20509 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20510 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20511 aarch64_dwarf_poly_indeterminate_value
20512
20513 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20514 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20515 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20516
20517 #undef TARGET_HARD_REGNO_NREGS
20518 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20519 #undef TARGET_HARD_REGNO_MODE_OK
20520 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20521
20522 #undef TARGET_MODES_TIEABLE_P
20523 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20524
20525 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20526 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20527 aarch64_hard_regno_call_part_clobbered
20528
20529 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20530 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20531 aarch64_remove_extra_call_preserved_regs
20532
20533 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20534 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20535 aarch64_return_call_with_max_clobbers
20536
20537 #undef TARGET_CONSTANT_ALIGNMENT
20538 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20539
20540 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20541 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20542 aarch64_stack_clash_protection_alloca_probe_range
20543
20544 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20545 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20546
20547 #undef TARGET_CAN_CHANGE_MODE_CLASS
20548 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20549
20550 #undef TARGET_SELECT_EARLY_REMAT_MODES
20551 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20552
20553 #undef TARGET_SPECULATION_SAFE_VALUE
20554 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20555
20556 #undef TARGET_ESTIMATED_POLY_VALUE
20557 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20558
20559 #undef TARGET_ATTRIBUTE_TABLE
20560 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20561
20562 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20563 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20564 aarch64_simd_clone_compute_vecsize_and_simdlen
20565
20566 #undef TARGET_SIMD_CLONE_ADJUST
20567 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20568
20569 #undef TARGET_SIMD_CLONE_USABLE
20570 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20571
20572 #undef TARGET_COMP_TYPE_ATTRIBUTES
20573 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20574
20575 #undef TARGET_GET_MULTILIB_ABI_NAME
20576 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20577
20578 #if CHECKING_P
20579 #undef TARGET_RUN_TARGET_SELFTESTS
20580 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20581 #endif /* #if CHECKING_P */
20582
20583 #undef TARGET_ASM_POST_CFI_STARTPROC
20584 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20585
20586 struct gcc_target targetm = TARGET_INITIALIZER;
20587
20588 #include "gt-aarch64.h"