1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
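/* Illustrative sketch (not part of this file): how the constructors above
   might be used to describe immediates.  The variable names base_rtx and
   step_rtx below are hypothetical placeholders, not identifiers defined in
   this file.

     // A vector of 32-bit elements, each equal to 0x55 << 8, representable
     // as a MOV-style immediate with an LSL #8 modifier:
     simd_immediate_info info1 (SImode, 0x55, simd_immediate_info::MOV,
                                simd_immediate_info::LSL, 8);

     // An INDEX-style immediate whose elements are BASE, BASE + STEP, ...:
     simd_immediate_info info2 (DImode, base_rtx, step_rtx);

   The immediate-validation code later in this file fills in one of these
   forms and then inspects the INSN field and the matching member of U.  */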
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
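/* Illustrative sketch (not part of the build): the tables above are plain
   name/flag pairs terminated by a NULL entry, so a user-supplied token can
   be resolved with a linear scan.  lookup_tuning_flag below is a
   hypothetical helper, not a GCC function; the real option parsing later in
   this file works along the same lines.

     static unsigned int
     lookup_tuning_flag (const char *token)
     {
       for (const aarch64_flag_desc *d = aarch64_tuning_flags; d->name; d++)
         if (strcmp (token, d->name) == 0)
           return d->flag;
       return AARCH64_EXTRA_TUNE_NONE;
     }

   "none" and "all" need no special casing because they are ordinary
   entries in the same table.  */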
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1220 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1227 {
1228 const char *const name;
1229 const unsigned long flags_on;
1230 const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
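/* The enum above lists the condition codes in complementary pairs
   (EQ/NE, CS/CC, MI/PL, ...), so flipping the low bit inverts a code.
   For example (illustrative only):

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */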
1242
1243 struct aarch64_branch_protect_type
1244 {
1245 /* The type's name that the user passes to the branch-protection option
1246 string. */
1247 const char* name;
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding to this type and the
1250 second argument is the next token in the option string.
1251 Return values:
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1254 should print an error.
1255 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1256 own error. */
1257 enum aarch64_parse_opt_result (*handler)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type* subtypes;
1260 unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267 aarch64_enable_bti = 0;
1268 if (rest)
1269 {
1270 error ("unexpected %<%s%> after %<%s%>", rest, str);
1271 return AARCH64_PARSE_INVALID_FEATURE;
1272 }
1273 return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 aarch64_enable_bti = 1;
1282 if (rest)
1283 {
1284 error ("unexpected %<%s%> after %<%s%>", rest, str);
1285 return AARCH64_PARSE_INVALID_FEATURE;
1286 }
1287 return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292 char* rest ATTRIBUTE_UNUSED)
1293 {
1294 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295 aarch64_ra_sign_key = AARCH64_KEY_A;
1296 return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301 char* rest ATTRIBUTE_UNUSED)
1302 {
1303 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304 return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309 char* rest ATTRIBUTE_UNUSED)
1310 {
1311 aarch64_ra_sign_key = AARCH64_KEY_B;
1312 return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317 char* rest ATTRIBUTE_UNUSED)
1318 {
1319 aarch64_enable_bti = 1;
1320 return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326 { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335 { NULL, NULL, NULL, 0 }
1336 };
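/* Illustrative example (not part of the build): a -mbranch-protection
   string is parsed token by token against these tables.  For instance,
   "pac-ret+leaf+bti" would be handled roughly as:

     "pac-ret" -> aarch64_handle_pac_ret_protection  (sign non-leaf returns
                                                      with key A)
     "leaf"    -> aarch64_handle_pac_ret_leaf        (extend signing to all
                                                      functions)
     "bti"     -> aarch64_handle_bti_protection      (enable BTI)

   while "standard" enables return-address signing and BTI in one step and
   "none" clears everything.  The walk itself is driven by the parsing code
   declared above (aarch64_parse_branch_protection).  */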
1337
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
1351
1352 /* Return the assembly token for svpattern value VALUE. */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357 switch (pattern)
1358 {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362 case AARCH64_NUM_SVPATTERNS:
1363 break;
1364 }
1365 gcc_unreachable ();
1366 }
1367
1368 /* Generate code to enable conditional branches in functions over 1 MiB. */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371 const char * branch_format)
1372 {
1373 rtx_code_label * tmp_label = gen_label_rtx ();
1374 char label_buf[256];
1375 char buffer[128];
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377 CODE_LABEL_NUMBER (tmp_label));
1378 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379 rtx dest_label = operands[pos_label];
1380 operands[pos_label] = tmp_label;
1381
1382 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383 output_asm_insn (buffer, operands);
1384
1385 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386 operands[pos_label] = dest_label;
1387 output_asm_insn (buffer, operands);
1388 return "";
1389 }
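/* For example (illustrative only), a conditional branch such as
   "tbz x0, #3, <dest>" whose destination is out of range is emitted via
   this function as the pair:

        tbnz    x0, #3, .LtbN           <- BRANCH_FORMAT (already inverted
                                           by the caller) plus a fresh
                                           local label
        b       <dest>                  <- unconditional branch, +/-128 MiB
     .LtbN:

   so the long-range unconditional branch is executed exactly when the
   original conditional branch would have been taken.  */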
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394 if (TARGET_GENERAL_REGS_ONLY)
1395 if (FLOAT_MODE_P (mode))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1398 else
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1401 else
1402 if (FLOAT_MODE_P (mode))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1405 else
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428 reg_class_t best_class)
1429 {
1430 machine_mode mode;
1431
1432 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433 || !reg_class_subset_p (FP_REGS, allocno_class))
1434 return allocno_class;
1435
1436 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437 || !reg_class_subset_p (FP_REGS, best_class))
1438 return best_class;
1439
1440 mode = PSEUDO_REGNO_MODE (regno);
1441 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
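/* A worked example (illustrative only): for a pseudo of mode DFmode whose
   allocno class and best class are both POINTER_AND_FP_REGS, both subset
   checks above fall through and the mode decides, so the function returns
   FP_REGS; for an SImode pseudo in the same situation it returns
   GENERAL_REGS.  */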
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447 if (GET_MODE_UNIT_SIZE (mode) == 4)
1448 return aarch64_tune_params.min_div_recip_mul_sf;
1449 return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456 if (VECTOR_MODE_P (mode))
1457 return aarch64_tune_params.vec_reassoc_width;
1458 if (INTEGRAL_MODE_P (mode))
1459 return aarch64_tune_params.int_reassoc_width;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462 return aarch64_tune_params.fp_reassoc_width;
1463 return 1;
1464 }
1465
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470 if (GP_REGNUM_P (regno))
1471 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472 else if (regno == SP_REGNUM)
1473 return AARCH64_DWARF_SP;
1474 else if (FP_REGNUM_P (regno))
1475 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476 else if (PR_REGNUM_P (regno))
1477 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478 else if (regno == VG_REGNUM)
1479 return AARCH64_DWARF_VG;
1480
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS;
1484 }
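/* For example (illustrative only):

     aarch64_dbx_register_number (R0_REGNUM + 5)  -> AARCH64_DWARF_R0 + 5
     aarch64_dbx_register_number (SP_REGNUM)      -> AARCH64_DWARF_SP
     aarch64_dbx_register_number (V0_REGNUM + 3)  -> AARCH64_DWARF_V0 + 3
     aarch64_dbx_register_number (CC_REGNUM)      -> DWARF_FRAME_REGISTERS
                                                     (no DWARF equivalent)  */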
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491 if (CONST_DOUBLE_P (x))
1492 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493 return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500 return (TARGET_SIMD
1501 && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode. */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508 return (TARGET_SVE
1509 && (mode == VNx16BImode
1510 || mode == VNx8BImode
1511 || mode == VNx4BImode
1512 || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT = 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531 if (aarch64_advsimd_struct_mode_p (mode))
1532 return VEC_ADVSIMD | VEC_STRUCT;
1533
1534 if (aarch64_sve_pred_mode_p (mode))
1535 return VEC_SVE_PRED;
1536
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1540 switch (mode)
1541 {
1542 /* Single SVE vectors. */
1543 case E_VNx16QImode:
1544 case E_VNx8HImode:
1545 case E_VNx4SImode:
1546 case E_VNx2DImode:
1547 case E_VNx8HFmode:
1548 case E_VNx4SFmode:
1549 case E_VNx2DFmode:
1550 return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552 /* x2 SVE vectors. */
1553 case E_VNx32QImode:
1554 case E_VNx16HImode:
1555 case E_VNx8SImode:
1556 case E_VNx4DImode:
1557 case E_VNx16HFmode:
1558 case E_VNx8SFmode:
1559 case E_VNx4DFmode:
1560 /* x3 SVE vectors. */
1561 case E_VNx48QImode:
1562 case E_VNx24HImode:
1563 case E_VNx12SImode:
1564 case E_VNx6DImode:
1565 case E_VNx24HFmode:
1566 case E_VNx12SFmode:
1567 case E_VNx6DFmode:
1568 /* x4 SVE vectors. */
1569 case E_VNx64QImode:
1570 case E_VNx32HImode:
1571 case E_VNx16SImode:
1572 case E_VNx8DImode:
1573 case E_VNx32HFmode:
1574 case E_VNx16SFmode:
1575 case E_VNx8DFmode:
1576 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578 /* 64-bit Advanced SIMD vectors. */
1579 case E_V8QImode:
1580 case E_V4HImode:
1581 case E_V2SImode:
1582 /* ...E_V1DImode doesn't exist. */
1583 case E_V4HFmode:
1584 case E_V2SFmode:
1585 case E_V1DFmode:
1586 /* 128-bit Advanced SIMD vectors. */
1587 case E_V16QImode:
1588 case E_V8HImode:
1589 case E_V4SImode:
1590 case E_V2DImode:
1591 case E_V8HFmode:
1592 case E_V4SFmode:
1593 case E_V2DFmode:
1594 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596 default:
1597 return 0;
1598 }
1599 }
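/* Some illustrative results of the classification above (assuming the
   corresponding target features are enabled):

     E_V2SImode    -> VEC_ADVSIMD                (64-bit Advanced SIMD)
     E_V4SImode    -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     E_OImode      -> VEC_ADVSIMD | VEC_STRUCT   (pair of 128-bit vectors)
     E_VNx4SImode  -> VEC_SVE_DATA               (single SVE vector)
     E_VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT  (x2 SVE structure)
     E_VNx4BImode  -> VEC_SVE_PRED               (SVE predicate)
     E_SImode      -> 0                          (not a vector mode)  */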
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602 structure modes. */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610 vectors and structures. */
1611 bool
1612 aarch64_sve_mode_p (machine_mode mode)
1613 {
1614 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1615 }
1616
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618 or a structure of vectors. */
1619 static bool
1620 aarch64_sve_data_mode_p (machine_mode mode)
1621 {
1622 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1623 }
1624
1625 /* Implement target hook TARGET_ARRAY_MODE. */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1628 {
1629 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1630 && IN_RANGE (nelems, 2, 4))
1631 return mode_for_vector (GET_MODE_INNER (mode),
1632 GET_MODE_NUNITS (mode) * nelems);
1633
1634 return opt_machine_mode ();
1635 }
1636
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1638 static bool
1639 aarch64_array_mode_supported_p (machine_mode mode,
1640 unsigned HOST_WIDE_INT nelems)
1641 {
1642 if (TARGET_SIMD
1643 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1644 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1645 && (nelems >= 2 && nelems <= 4))
1646 return true;
1647
1648 return false;
1649 }
1650
1651 /* Return the SVE predicate mode to use for elements that have
1652 ELEM_NBYTES bytes, if such a mode exists. */
1653
1654 opt_machine_mode
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1656 {
1657 if (TARGET_SVE)
1658 {
1659 if (elem_nbytes == 1)
1660 return VNx16BImode;
1661 if (elem_nbytes == 2)
1662 return VNx8BImode;
1663 if (elem_nbytes == 4)
1664 return VNx4BImode;
1665 if (elem_nbytes == 8)
1666 return VNx2BImode;
1667 }
1668 return opt_machine_mode ();
1669 }
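/* For example (illustrative only), with SVE enabled:

     aarch64_sve_pred_mode (1).require () == VNx16BImode
     aarch64_sve_pred_mode (2).require () == VNx8BImode
     aarch64_sve_pred_mode (4).require () == VNx4BImode
     aarch64_sve_pred_mode (8).require () == VNx2BImode

   aarch64_get_mask_mode below uses this mapping to choose the mask mode
   for a full SVE data vector.  */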
1670
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1672
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1675 {
1676 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1677 {
1678 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1679 machine_mode pred_mode;
1680 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1681 return pred_mode;
1682 }
1683
1684 return default_get_mask_mode (nunits, nbytes);
1685 }
1686
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1688
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1691 {
1692 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1693 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1694 machine_mode mode;
1695 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1696 if (inner_mode == GET_MODE_INNER (mode)
1697 && known_eq (nunits, GET_MODE_NUNITS (mode))
1698 && aarch64_sve_data_mode_p (mode))
1699 return mode;
1700 return opt_machine_mode ();
1701 }
1702
1703 /* Return the integer element mode associated with SVE mode MODE. */
1704
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode)
1707 {
1708 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1709 GET_MODE_NUNITS (mode));
1710 return int_mode_for_size (elt_bits, 0).require ();
1711 }
1712
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714 Unlike mode_for_int_vector, this can handle the case in which
1715 MODE is a predicate (and thus has a different total size). */
1716
1717 static machine_mode
1718 aarch64_sve_int_mode (machine_mode mode)
1719 {
1720 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1721 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1722 }
1723
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1725 prefer to use the first arithmetic operand as the else value if
1726 the else value doesn't matter, since that exactly matches the SVE
1727 destructive merging form. For ternary operations we could either
1728 pick the first operand and use FMAD-like instructions or the last
1729 operand and use FMLA-like instructions; the latter seems more
1730 natural. */
1731
1732 static tree
1733 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1734 {
1735 return nops == 3 ? ops[2] : ops[0];
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_NREGS. */
1739
1740 static unsigned int
1741 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1742 {
1743 /* ??? Logically we should only need to provide a value when
1744 HARD_REGNO_MODE_OK says that the combination is valid,
1745 but at the moment we need to handle all modes. Just ignore
1746 any runtime parts for registers that can't store them. */
1747 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1748 switch (aarch64_regno_regclass (regno))
1749 {
1750 case FP_REGS:
1751 case FP_LO_REGS:
1752 case FP_LO8_REGS:
1753 if (aarch64_sve_data_mode_p (mode))
1754 return exact_div (GET_MODE_SIZE (mode),
1755 BYTES_PER_SVE_VECTOR).to_constant ();
1756 return CEIL (lowest_size, UNITS_PER_VREG);
1757 case PR_REGS:
1758 case PR_LO_REGS:
1759 case PR_HI_REGS:
1760 return 1;
1761 default:
1762 return CEIL (lowest_size, UNITS_PER_WORD);
1763 }
1764 gcc_unreachable ();
1765 }
1766
1767 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1768
1769 static bool
1770 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1771 {
1772 if (GET_MODE_CLASS (mode) == MODE_CC)
1773 return regno == CC_REGNUM;
1774
1775 if (regno == VG_REGNUM)
1776 /* This must have the same size as _Unwind_Word. */
1777 return mode == DImode;
1778
1779 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1780 if (vec_flags & VEC_SVE_PRED)
1781 return PR_REGNUM_P (regno);
1782
1783 if (PR_REGNUM_P (regno))
1784 return 0;
1785
1786 if (regno == SP_REGNUM)
1787 /* The purpose of comparing with ptr_mode is to support the
1788 global register variable associated with the stack pointer
1789 register via the syntax of asm ("wsp") in ILP32. */
1790 return mode == Pmode || mode == ptr_mode;
1791
1792 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1793 return mode == Pmode;
1794
1795 if (GP_REGNUM_P (regno))
1796 {
1797 if (known_le (GET_MODE_SIZE (mode), 8))
1798 return true;
1799 else if (known_le (GET_MODE_SIZE (mode), 16))
1800 return (regno & 1) == 0;
1801 }
1802 else if (FP_REGNUM_P (regno))
1803 {
1804 if (vec_flags & VEC_STRUCT)
1805 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1806 else
1807 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1808 }
1809
1810 return false;
1811 }
1812
1813 /* Return true if this is a definition of a vectorized simd function. */
1814
1815 static bool
1816 aarch64_simd_decl_p (tree fndecl)
1817 {
1818 tree fntype;
1819
1820 if (fndecl == NULL)
1821 return false;
1822 fntype = TREE_TYPE (fndecl);
1823 if (fntype == NULL)
1824 return false;
1825
1826 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1827 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1828 return true;
1829
1830 return false;
1831 }
1832
1833 /* Return the mode a register save/restore should use. DImode for integer
1834 registers, DFmode for FP registers in non-SIMD functions (they only save
1835 the bottom half of a 128-bit register), or TFmode for FP registers in
1836 SIMD functions. */
1837
1838 static machine_mode
1839 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1840 {
1841 return GP_REGNUM_P (regno)
1842 ? E_DImode
1843 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1844 }
1845
1846 /* Return true if the instruction is a call to a SIMD function, false
1847 if it is not a SIMD function or if we do not know anything about
1848 the function. */
1849
1850 static bool
1851 aarch64_simd_call_p (rtx_insn *insn)
1852 {
1853 rtx symbol;
1854 rtx call;
1855 tree fndecl;
1856
1857 gcc_assert (CALL_P (insn));
1858 call = get_call_rtx_from (insn);
1859 symbol = XEXP (XEXP (call, 0), 0);
1860 if (GET_CODE (symbol) != SYMBOL_REF)
1861 return false;
1862 fndecl = SYMBOL_REF_DECL (symbol);
1863 if (!fndecl)
1864 return false;
1865
1866 return aarch64_simd_decl_p (fndecl);
1867 }
1868
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1870 a function that uses the SIMD ABI, take advantage of the extra
1871 call-preserved registers that the ABI provides. */
1872
1873 void
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1875 HARD_REG_SET *return_set)
1876 {
1877 if (aarch64_simd_call_p (insn))
1878 {
1879 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1880 if (FP_SIMD_SAVED_REGNUM_P (regno))
1881 CLEAR_HARD_REG_BIT (*return_set, regno);
1882 }
1883 }
1884
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1886 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1887 clobbers the top 64 bits when restoring the bottom 64 bits. */
1888
1889 static bool
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1891 machine_mode mode)
1892 {
1893 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1894 return FP_REGNUM_P (regno)
1895 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1896 }
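
/* For example, a 16-byte V4SFmode value held in an FP register is partially
   clobbered by an ordinary call (16 > 8) and so must be saved around it, but
   it survives a call to a function that uses the aarch64_vector_pcs ABI
   (16 > 16 is false).  An SVE vector value can be wider than 16 bytes, so it
   is treated as partially clobbered even across SIMD-ABI calls.  */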
1897
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1899
1900 rtx_insn *
1901 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1902 {
1903 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1904
1905 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1906 return call_1;
1907 else
1908 return call_2;
1909 }
1910
1911 /* Implement REGMODE_NATURAL_SIZE. */
1912 poly_uint64
1913 aarch64_regmode_natural_size (machine_mode mode)
1914 {
1915 /* The natural size for SVE data modes is one SVE data vector,
1916 and similarly for predicates. We can't independently modify
1917 anything smaller than that. */
1918 /* ??? For now, only do this for variable-width SVE registers.
1919 Doing it for constant-sized registers breaks lower-subreg.c. */
1920 /* ??? And once that's fixed, we should probably have similar
1921 code for Advanced SIMD. */
1922 if (!aarch64_sve_vg.is_constant ())
1923 {
1924 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1925 if (vec_flags & VEC_SVE_PRED)
1926 return BYTES_PER_SVE_PRED;
1927 if (vec_flags & VEC_SVE_DATA)
1928 return BYTES_PER_SVE_VECTOR;
1929 }
1930 return UNITS_PER_WORD;
1931 }
1932
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1934 machine_mode
1935 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1936 machine_mode mode)
1937 {
1938 /* The predicate mode determines which bits are significant and
1939 which are "don't care". Decreasing the number of lanes would
1940 lose data while increasing the number of lanes would make bits
1941 unnecessarily significant. */
1942 if (PR_REGNUM_P (regno))
1943 return mode;
1944 if (known_ge (GET_MODE_SIZE (mode), 4))
1945 return mode;
1946 else
1947 return SImode;
1948 }
1949
1950 /* Return true if I's bits are consecutive ones from the MSB. */
1951 bool
1952 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1953 {
1954 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1955 }
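
/* For example, 0xffffffffffff8000 gives -i == 0x8000, whose exact_log2 is
   15, so the function returns true; 0xff00ff0000000000 gives a -i that is
   not a power of two, so it returns false.  The all-ones value also returns
   true (-i == 1), while zero returns false (exact_log2 (0) == -1).  */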
1956
1957 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1958 that strcpy from constants will be faster. */
1959
1960 static HOST_WIDE_INT
1961 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1962 {
1963 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1964 return MAX (align, BITS_PER_WORD);
1965 return align;
1966 }
1967
1968 /* Return true if calls to DECL should be treated as
1969 long-calls (i.e. called via a register). */
1970 static bool
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1972 {
1973 return false;
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977 long-calls (i.e. called via a register). */
1978 bool
1979 aarch64_is_long_call_p (rtx sym)
1980 {
1981 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1982 }
1983
1984 /* Return true if calls to symbol-ref SYM should not go through
1985 plt stubs. */
1986
1987 bool
1988 aarch64_is_noplt_call_p (rtx sym)
1989 {
1990 const_tree decl = SYMBOL_REF_DECL (sym);
1991
1992 if (flag_pic
1993 && decl
1994 && (!flag_plt
1995 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1996 && !targetm.binds_local_p (decl))
1997 return true;
1998
1999 return false;
2000 }
2001
2002 /* Return true if the offsets to a zero/sign-extract operation
2003 represent an expression that matches an extend operation. The
2004 operands represent the parameters from
2005
2006 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2007 bool
2008 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2009 rtx extract_imm)
2010 {
2011 HOST_WIDE_INT mult_val, extract_val;
2012
2013 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2014 return false;
2015
2016 mult_val = INTVAL (mult_imm);
2017 extract_val = INTVAL (extract_imm);
2018
2019 if (extract_val > 8
2020 && extract_val < GET_MODE_BITSIZE (mode)
2021 && exact_log2 (extract_val & ~7) > 0
2022 && (extract_val & 7) <= 4
2023 && mult_val == (1 << (extract_val & 7)))
2024 return true;
2025
2026 return false;
2027 }
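
/* For example, with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 the
   function returns true: the extract takes 34 bits starting at bit 0 of
   (reg * 4), which is equivalent to extending a 32-bit value and shifting
   it left by 2 -- the form used for extended-register operands with
   LSL #2.  */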
2028
2029 /* Emit an insn that's a simple single-set. Both the operands must be
2030 known to be valid. */
2031 inline static rtx_insn *
2032 emit_set_insn (rtx x, rtx y)
2033 {
2034 return emit_insn (gen_rtx_SET (x, y));
2035 }
2036
2037 /* X and Y are two things to compare using CODE. Emit the compare insn and
2038 return the rtx for register 0 in the proper mode. */
2039 rtx
2040 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2041 {
2042 machine_mode cmp_mode = GET_MODE (x);
2043 machine_mode cc_mode;
2044 rtx cc_reg;
2045
2046 if (cmp_mode == TImode)
2047 {
2048 gcc_assert (code == NE);
2049
2050 cc_mode = CCmode;
2051 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2052
2053 rtx x_lo = operand_subword (x, 0, 0, TImode);
2054 rtx y_lo = operand_subword (y, 0, 0, TImode);
2055 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2056
2057 rtx x_hi = operand_subword (x, 1, 0, TImode);
2058 rtx y_hi = operand_subword (y, 1, 0, TImode);
2059 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2060 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2061 GEN_INT (AARCH64_EQ)));
2062 }
2063 else
2064 {
2065 cc_mode = SELECT_CC_MODE (code, x, y);
2066 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2067 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2068 }
2069 return cc_reg;
2070 }
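
/* For the TImode case above, the emitted sequence is roughly (with x0:x1
   and x2:x3 holding the two 128-bit values; register numbers are
   illustrative):

   cmp  x0, x2
   ccmp x1, x3, #0, eq

   so that EQ/NE tests on CC_REGNUM reflect equality/inequality of the full
   128-bit values.  */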
2071
2072 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2073
2074 static rtx
2075 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2076 machine_mode y_mode)
2077 {
2078 if (y_mode == E_QImode || y_mode == E_HImode)
2079 {
2080 if (CONST_INT_P (y))
2081 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2082 else
2083 {
2084 rtx t, cc_reg;
2085 machine_mode cc_mode;
2086
2087 t = gen_rtx_ZERO_EXTEND (SImode, y);
2088 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2089 cc_mode = CC_SWPmode;
2090 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2091 emit_set_insn (cc_reg, t);
2092 return cc_reg;
2093 }
2094 }
2095
2096 if (!aarch64_plus_operand (y, y_mode))
2097 y = force_reg (y_mode, y);
2098
2099 return aarch64_gen_compare_reg (code, x, y);
2100 }
2101
2102 /* Build the SYMBOL_REF for __tls_get_addr. */
2103
2104 static GTY(()) rtx tls_get_addr_libfunc;
2105
2106 rtx
2107 aarch64_tls_get_addr (void)
2108 {
2109 if (!tls_get_addr_libfunc)
2110 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2111 return tls_get_addr_libfunc;
2112 }
2113
2114 /* Return the TLS model to use for ADDR. */
2115
2116 static enum tls_model
2117 tls_symbolic_operand_type (rtx addr)
2118 {
2119 enum tls_model tls_kind = TLS_MODEL_NONE;
2120 if (GET_CODE (addr) == CONST)
2121 {
2122 poly_int64 addend;
2123 rtx sym = strip_offset (addr, &addend);
2124 if (GET_CODE (sym) == SYMBOL_REF)
2125 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2126 }
2127 else if (GET_CODE (addr) == SYMBOL_REF)
2128 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2129
2130 return tls_kind;
2131 }
2132
2133 /* We allow LO_SUMs among our legitimate addresses, so that combine
2134 can take care of combining addresses where necessary, but for
2135 generation purposes we generate the address
2136 as:
2137 RTL Absolute
2138 tmp = hi (symbol_ref); adrp x1, foo
2139 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2140 nop
2141
2142 PIC TLS
2143 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2144 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2145 bl __tls_get_addr
2146 nop
2147
2148 Load TLS symbol, depending on TLS mechanism and TLS access model.
2149
2150 Global Dynamic - Traditional TLS:
2151 adrp tmp, :tlsgd:imm
2152 add dest, tmp, #:tlsgd_lo12:imm
2153 bl __tls_get_addr
2154
2155 Global Dynamic - TLS Descriptors:
2156 adrp dest, :tlsdesc:imm
2157 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2158 add dest, dest, #:tlsdesc_lo12:imm
2159 blr tmp
2160 mrs tp, tpidr_el0
2161 add dest, dest, tp
2162
2163 Initial Exec:
2164 mrs tp, tpidr_el0
2165 adrp tmp, :gottprel:imm
2166 ldr dest, [tmp, #:gottprel_lo12:imm]
2167 add dest, dest, tp
2168
2169 Local Exec:
2170 mrs tp, tpidr_el0
2171 add t0, tp, #:tprel_hi12:imm, lsl #12
2172 add t0, t0, #:tprel_lo12_nc:imm
2173 */
2174
2175 static void
2176 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2177 enum aarch64_symbol_type type)
2178 {
2179 switch (type)
2180 {
2181 case SYMBOL_SMALL_ABSOLUTE:
2182 {
2183 /* In ILP32, the mode of dest can be either SImode or DImode. */
2184 rtx tmp_reg = dest;
2185 machine_mode mode = GET_MODE (dest);
2186
2187 gcc_assert (mode == Pmode || mode == ptr_mode);
2188
2189 if (can_create_pseudo_p ())
2190 tmp_reg = gen_reg_rtx (mode);
2191
2192 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2193 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2194 return;
2195 }
2196
2197 case SYMBOL_TINY_ABSOLUTE:
2198 emit_insn (gen_rtx_SET (dest, imm));
2199 return;
2200
2201 case SYMBOL_SMALL_GOT_28K:
2202 {
2203 machine_mode mode = GET_MODE (dest);
2204 rtx gp_rtx = pic_offset_table_rtx;
2205 rtx insn;
2206 rtx mem;
2207
2208 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2209 here before rtl expansion. Tree IVOPT will generate an rtl pattern
2210 to decide rtx costs, in which case pic_offset_table_rtx is not
2211 initialized. In that case there is no need to generate the first
2212 adrp instruction, as the final cost for global variable access is
2213 one instruction. */
2214 if (gp_rtx != NULL)
2215 {
2216 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2217 use the page base as the GOT base, the first page may be wasted;
2218 in the worst case there is only 28K of space for the GOT).
2219
2220 The generated instruction sequence for accessing a global variable
2221 is:
2222
2223 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2224
2225 Only one instruction is needed, but we must initialize
2226 pic_offset_table_rtx properly. We generate an initialization insn
2227 for every global access and allow CSE to remove all redundant ones.
2228
2229 The final instruction sequence will look like the following
2230 for multiple global variable accesses.
2231
2232 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2233
2234 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2235 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2236 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2237 ... */
2238
2239 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2240 crtl->uses_pic_offset_table = 1;
2241 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2242
2243 if (mode != GET_MODE (gp_rtx))
2244 gp_rtx = gen_lowpart (mode, gp_rtx);
2245
2246 }
2247
2248 if (mode == ptr_mode)
2249 {
2250 if (mode == DImode)
2251 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2252 else
2253 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2254
2255 mem = XVECEXP (SET_SRC (insn), 0, 0);
2256 }
2257 else
2258 {
2259 gcc_assert (mode == Pmode);
2260
2261 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2262 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2263 }
2264
2265 /* The operand is expected to be a MEM. Whenever the related insn
2266 pattern changes, the above code which calculates MEM should be
2267 updated. */
2268 gcc_assert (GET_CODE (mem) == MEM);
2269 MEM_READONLY_P (mem) = 1;
2270 MEM_NOTRAP_P (mem) = 1;
2271 emit_insn (insn);
2272 return;
2273 }
2274
2275 case SYMBOL_SMALL_GOT_4G:
2276 {
2277 /* In ILP32, the mode of dest can be either SImode or DImode,
2278 while the got entry is always of SImode size. The mode of
2279 dest depends on how dest is used: if dest is assigned to a
2280 pointer (e.g. stored in memory), it has SImode; it may have
2281 DImode if dest is dereferenced to access the memory.
2282 This is why we have to handle three different ldr_got_small
2283 patterns here (two patterns for ILP32). */
2284
2285 rtx insn;
2286 rtx mem;
2287 rtx tmp_reg = dest;
2288 machine_mode mode = GET_MODE (dest);
2289
2290 if (can_create_pseudo_p ())
2291 tmp_reg = gen_reg_rtx (mode);
2292
2293 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2294 if (mode == ptr_mode)
2295 {
2296 if (mode == DImode)
2297 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2298 else
2299 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2300
2301 mem = XVECEXP (SET_SRC (insn), 0, 0);
2302 }
2303 else
2304 {
2305 gcc_assert (mode == Pmode);
2306
2307 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2308 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2309 }
2310
2311 gcc_assert (GET_CODE (mem) == MEM);
2312 MEM_READONLY_P (mem) = 1;
2313 MEM_NOTRAP_P (mem) = 1;
2314 emit_insn (insn);
2315 return;
2316 }
2317
2318 case SYMBOL_SMALL_TLSGD:
2319 {
2320 rtx_insn *insns;
2321 machine_mode mode = GET_MODE (dest);
2322 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2323
2324 start_sequence ();
2325 if (TARGET_ILP32)
2326 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2327 else
2328 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2329 insns = get_insns ();
2330 end_sequence ();
2331
2332 RTL_CONST_CALL_P (insns) = 1;
2333 emit_libcall_block (insns, dest, result, imm);
2334 return;
2335 }
2336
2337 case SYMBOL_SMALL_TLSDESC:
2338 {
2339 machine_mode mode = GET_MODE (dest);
2340 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2341 rtx tp;
2342
2343 gcc_assert (mode == Pmode || mode == ptr_mode);
2344
2345 /* In ILP32, the got entry is always of SImode size. Unlike
2346 small GOT, the dest is fixed at reg 0. */
2347 if (TARGET_ILP32)
2348 emit_insn (gen_tlsdesc_small_si (imm));
2349 else
2350 emit_insn (gen_tlsdesc_small_di (imm));
2351 tp = aarch64_load_tp (NULL);
2352
2353 if (mode != Pmode)
2354 tp = gen_lowpart (mode, tp);
2355
2356 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2357 if (REG_P (dest))
2358 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2359 return;
2360 }
2361
2362 case SYMBOL_SMALL_TLSIE:
2363 {
2364 /* In ILP32, the mode of dest can be either SImode or DImode,
2365 while the got entry is always of SImode size. The mode of
2366 dest depends on how dest is used: if dest is assigned to a
2367 pointer (e.g. stored in memory), it has SImode; it may have
2368 DImode if dest is dereferenced to access the memory.
2369 This is why we have to handle three different tlsie_small
2370 patterns here (two patterns for ILP32). */
2371 machine_mode mode = GET_MODE (dest);
2372 rtx tmp_reg = gen_reg_rtx (mode);
2373 rtx tp = aarch64_load_tp (NULL);
2374
2375 if (mode == ptr_mode)
2376 {
2377 if (mode == DImode)
2378 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2379 else
2380 {
2381 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2382 tp = gen_lowpart (mode, tp);
2383 }
2384 }
2385 else
2386 {
2387 gcc_assert (mode == Pmode);
2388 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2389 }
2390
2391 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2395 }
2396
2397 case SYMBOL_TLSLE12:
2398 case SYMBOL_TLSLE24:
2399 case SYMBOL_TLSLE32:
2400 case SYMBOL_TLSLE48:
2401 {
2402 machine_mode mode = GET_MODE (dest);
2403 rtx tp = aarch64_load_tp (NULL);
2404
2405 if (mode != Pmode)
2406 tp = gen_lowpart (mode, tp);
2407
2408 switch (type)
2409 {
2410 case SYMBOL_TLSLE12:
2411 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2412 (dest, tp, imm));
2413 break;
2414 case SYMBOL_TLSLE24:
2415 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2416 (dest, tp, imm));
2417 break;
2418 case SYMBOL_TLSLE32:
2419 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2420 (dest, imm));
2421 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2422 (dest, dest, tp));
2423 break;
2424 case SYMBOL_TLSLE48:
2425 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2426 (dest, imm));
2427 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2428 (dest, dest, tp));
2429 break;
2430 default:
2431 gcc_unreachable ();
2432 }
2433
2434 if (REG_P (dest))
2435 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2436 return;
2437 }
2438
2439 case SYMBOL_TINY_GOT:
2440 emit_insn (gen_ldr_got_tiny (dest, imm));
2441 return;
2442
2443 case SYMBOL_TINY_TLSIE:
2444 {
2445 machine_mode mode = GET_MODE (dest);
2446 rtx tp = aarch64_load_tp (NULL);
2447
2448 if (mode == ptr_mode)
2449 {
2450 if (mode == DImode)
2451 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2452 else
2453 {
2454 tp = gen_lowpart (mode, tp);
2455 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2456 }
2457 }
2458 else
2459 {
2460 gcc_assert (mode == Pmode);
2461 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2462 }
2463
2464 if (REG_P (dest))
2465 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2466 return;
2467 }
2468
2469 default:
2470 gcc_unreachable ();
2471 }
2472 }
2473
2474 /* Emit a move from SRC to DEST. Assume that the move expanders can
2475 handle all moves if !can_create_pseudo_p (). The distinction is
2476 important because, unlike emit_move_insn, the move expanders know
2477 how to force Pmode objects into the constant pool even when the
2478 constant pool address is not itself legitimate. */
2479 static rtx
2480 aarch64_emit_move (rtx dest, rtx src)
2481 {
2482 return (can_create_pseudo_p ()
2483 ? emit_move_insn (dest, src)
2484 : emit_move_insn_1 (dest, src));
2485 }
2486
2487 /* Apply UNOPTAB to OP and store the result in DEST. */
2488
2489 static void
2490 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2491 {
2492 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2493 if (dest != tmp)
2494 emit_move_insn (dest, tmp);
2495 }
2496
2497 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2498
2499 static void
2500 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2501 {
2502 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2503 OPTAB_DIRECT);
2504 if (dest != tmp)
2505 emit_move_insn (dest, tmp);
2506 }
2507
2508 /* Split a 128-bit move operation into two 64-bit move operations,
2509 taking care to handle partial overlap of register to register
2510 copies. Special cases are needed when moving between GP regs and
2511 FP regs. SRC can be a register, constant or memory; DST a register
2512 or memory. If either operand is memory it must not have any side
2513 effects. */
2514 void
2515 aarch64_split_128bit_move (rtx dst, rtx src)
2516 {
2517 rtx dst_lo, dst_hi;
2518 rtx src_lo, src_hi;
2519
2520 machine_mode mode = GET_MODE (dst);
2521
2522 gcc_assert (mode == TImode || mode == TFmode);
2523 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2524 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2525
2526 if (REG_P (dst) && REG_P (src))
2527 {
2528 int src_regno = REGNO (src);
2529 int dst_regno = REGNO (dst);
2530
2531 /* Handle FP <-> GP regs. */
2532 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2533 {
2534 src_lo = gen_lowpart (word_mode, src);
2535 src_hi = gen_highpart (word_mode, src);
2536
2537 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2538 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2539 return;
2540 }
2541 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2542 {
2543 dst_lo = gen_lowpart (word_mode, dst);
2544 dst_hi = gen_highpart (word_mode, dst);
2545
2546 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2547 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2548 return;
2549 }
2550 }
2551
2552 dst_lo = gen_lowpart (word_mode, dst);
2553 dst_hi = gen_highpart (word_mode, dst);
2554 src_lo = gen_lowpart (word_mode, src);
2555 src_hi = gen_highpart_mode (word_mode, mode, src);
2556
2557 /* At most one pairing may overlap. */
2558 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2559 {
2560 aarch64_emit_move (dst_hi, src_hi);
2561 aarch64_emit_move (dst_lo, src_lo);
2562 }
2563 else
2564 {
2565 aarch64_emit_move (dst_lo, src_lo);
2566 aarch64_emit_move (dst_hi, src_hi);
2567 }
2568 }
2569
2570 bool
2571 aarch64_split_128bit_move_p (rtx dst, rtx src)
2572 {
2573 return (! REG_P (src)
2574 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2575 }
2576
2577 /* Split a complex SIMD combine. */
2578
2579 void
2580 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2581 {
2582 machine_mode src_mode = GET_MODE (src1);
2583 machine_mode dst_mode = GET_MODE (dst);
2584
2585 gcc_assert (VECTOR_MODE_P (dst_mode));
2586 gcc_assert (register_operand (dst, dst_mode)
2587 && register_operand (src1, src_mode)
2588 && register_operand (src2, src_mode));
2589
2590 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2591 return;
2592 }
2593
2594 /* Split a complex SIMD move. */
2595
2596 void
2597 aarch64_split_simd_move (rtx dst, rtx src)
2598 {
2599 machine_mode src_mode = GET_MODE (src);
2600 machine_mode dst_mode = GET_MODE (dst);
2601
2602 gcc_assert (VECTOR_MODE_P (dst_mode));
2603
2604 if (REG_P (dst) && REG_P (src))
2605 {
2606 gcc_assert (VECTOR_MODE_P (src_mode));
2607 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2608 }
2609 }
2610
2611 bool
2612 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2613 machine_mode ymode, rtx y)
2614 {
2615 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2616 gcc_assert (r != NULL);
2617 return rtx_equal_p (x, r);
2618 }
2619
2620 /* Return TARGET if it is nonnull and a register of mode MODE.
2621 Otherwise, return a fresh register of mode MODE if we can,
2622 or TARGET reinterpreted as MODE if we can't. */
2623
2624 static rtx
2625 aarch64_target_reg (rtx target, machine_mode mode)
2626 {
2627 if (target && REG_P (target) && GET_MODE (target) == mode)
2628 return target;
2629 if (!can_create_pseudo_p ())
2630 {
2631 gcc_assert (target);
2632 return gen_lowpart (mode, target);
2633 }
2634 return gen_reg_rtx (mode);
2635 }
2636
2637 /* Return a register that contains the constant in BUILDER, given that
2638 the constant is a legitimate move operand. Use TARGET as the register
2639 if it is nonnull and convenient. */
2640
2641 static rtx
2642 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2643 {
2644 rtx src = builder.build ();
2645 target = aarch64_target_reg (target, GET_MODE (src));
2646 emit_insn (gen_rtx_SET (target, src));
2647 return target;
2648 }
2649
2650 static rtx
2651 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2652 {
2653 if (can_create_pseudo_p ())
2654 return force_reg (mode, value);
2655 else
2656 {
2657 gcc_assert (x);
2658 aarch64_emit_move (x, value);
2659 return x;
2660 }
2661 }
2662
2663 /* Return true if predicate value X is a constant in which every element
2664 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2665 value, i.e. as a predicate in which all bits are significant. */
2666
2667 static bool
2668 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2669 {
2670 if (GET_CODE (x) != CONST_VECTOR)
2671 return false;
2672
2673 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2674 GET_MODE_NUNITS (GET_MODE (x)));
2675 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2676 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2677 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2678
2679 unsigned int nelts = const_vector_encoded_nelts (x);
2680 for (unsigned int i = 0; i < nelts; ++i)
2681 {
2682 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2683 if (!CONST_INT_P (elt))
2684 return false;
2685
2686 builder.quick_push (elt);
2687 for (unsigned int j = 1; j < factor; ++j)
2688 builder.quick_push (const0_rtx);
2689 }
2690 builder.finalize ();
2691 return true;
2692 }
2693
2694 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2695 widest predicate element size it can have (that is, the largest size
2696 for which each element would still be 0 or 1). */
2697
2698 unsigned int
2699 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2700 {
2701 /* Start with the most optimistic assumption: that we only need
2702 one bit per pattern. This is what we will use if only the first
2703 bit in each pattern is ever set. */
2704 unsigned int mask = GET_MODE_SIZE (DImode);
2705 mask |= builder.npatterns ();
2706
2707 /* Look for set bits. */
2708 unsigned int nelts = builder.encoded_nelts ();
2709 for (unsigned int i = 1; i < nelts; ++i)
2710 if (INTVAL (builder.elt (i)) != 0)
2711 {
2712 if (i & 1)
2713 return 1;
2714 mask |= i;
2715 }
2716 return mask & -mask;
2717 }
2718
2719 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2720 that the constant would have with predicate element size ELT_SIZE
2721 (ignoring the upper bits in each element) and return:
2722
2723 * -1 if all bits are set
2724 * N if the predicate has N leading set bits followed by all clear bits
2725 * 0 if the predicate does not have any of these forms. */
2726
2727 int
2728 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2729 unsigned int elt_size)
2730 {
2731 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2732 followed by set bits. */
2733 if (builder.nelts_per_pattern () == 3)
2734 return 0;
2735
2736 /* Skip over leading set bits. */
2737 unsigned int nelts = builder.encoded_nelts ();
2738 unsigned int i = 0;
2739 for (; i < nelts; i += elt_size)
2740 if (INTVAL (builder.elt (i)) == 0)
2741 break;
2742 unsigned int vl = i / elt_size;
2743
2744 /* Check for the all-true case. */
2745 if (i == nelts)
2746 return -1;
2747
2748 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2749 repeating pattern of set bits followed by clear bits. */
2750 if (builder.nelts_per_pattern () != 2)
2751 return 0;
2752
2753 /* We have a "foreground" value and a duplicated "background" value.
2754 If the background might repeat and the last set bit belongs to it,
2755 we might have set bits followed by clear bits followed by set bits. */
2756 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2757 return 0;
2758
2759 /* Make sure that the rest are all clear. */
2760 for (; i < nelts; i += elt_size)
2761 if (INTVAL (builder.elt (i)) != 0)
2762 return 0;
2763
2764 return vl;
2765 }
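
/* For example, given the VNx16BI encoding of a predicate whose first six
   byte elements are active, the function returns 6 for ELT_SIZE == 1 and 3
   for ELT_SIZE == 2 (only every second bit is inspected).  An all-active
   predicate returns -1, and a predicate with a gap, such as elements 0-3
   and 5 active, returns 0.  */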
2766
2767 /* See if there is an svpattern that encodes an SVE predicate of mode
2768 PRED_MODE in which the first VL bits are set and the rest are clear.
2769 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2770 A VL of -1 indicates an all-true vector. */
2771
2772 aarch64_svpattern
2773 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2774 {
2775 if (vl < 0)
2776 return AARCH64_SV_ALL;
2777
2778 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2779 return AARCH64_NUM_SVPATTERNS;
2780
2781 if (vl >= 1 && vl <= 8)
2782 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2783
2784 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2785 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2786
2787 int max_vl;
2788 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2789 {
2790 if (vl == (max_vl / 3) * 3)
2791 return AARCH64_SV_MUL3;
2792 /* These would only trigger for non-power-of-2 lengths. */
2793 if (vl == (max_vl & -4))
2794 return AARCH64_SV_MUL4;
2795 if (vl == (1 << floor_log2 (max_vl)))
2796 return AARCH64_SV_POW2;
2797 if (vl == max_vl)
2798 return AARCH64_SV_ALL;
2799 }
2800 return AARCH64_NUM_SVPATTERNS;
2801 }
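
/* For example, VL == 7 maps to AARCH64_SV_VL7 and VL == 16 maps to
   AARCH64_SV_VL16.  Other lengths are only representable when the number
   of elements is a compile-time constant; e.g. with a known maximum of 16
   elements, VL == 15 matches AARCH64_SV_MUL3.  */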
2802
2803 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2804 bits has the lowest bit set and the upper bits clear. This is the
2805 VNx16BImode equivalent of a PTRUE for controlling elements of
2806 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2807 all bits are significant, even the upper zeros. */
2808
2809 rtx
2810 aarch64_ptrue_all (unsigned int elt_size)
2811 {
2812 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2813 builder.quick_push (const1_rtx);
2814 for (unsigned int i = 1; i < elt_size; ++i)
2815 builder.quick_push (const0_rtx);
2816 return builder.build ();
2817 }
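
/* For example, aarch64_ptrue_all (4) builds the repeating bit pattern
   { 1, 0, 0, 0 }, i.e. one significant bit per 4-byte data element, with
   the three upper bits of each 4-bit group explicitly zero.  */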
2818
2819 /* Return an all-true predicate register of mode MODE. */
2820
2821 rtx
2822 aarch64_ptrue_reg (machine_mode mode)
2823 {
2824 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2825 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2826 return gen_lowpart (mode, reg);
2827 }
2828
2829 /* Return an all-false predicate register of mode MODE. */
2830
2831 rtx
2832 aarch64_pfalse_reg (machine_mode mode)
2833 {
2834 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2835 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2836 return gen_lowpart (mode, reg);
2837 }
2838
2839 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2840 true, or alternatively if we know that the operation predicated by
2841 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2842 aarch64_sve_gp_strictness operand that describes the operation
2843 predicated by PRED1[0]. */
2844
2845 bool
2846 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2847 {
2848 machine_mode mode = GET_MODE (pred2);
2849 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2850 && mode == GET_MODE (pred1[0])
2851 && aarch64_sve_gp_strictness (pred1[1], SImode));
2852 return (pred1[0] == CONSTM1_RTX (mode)
2853 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2854 || rtx_equal_p (pred1[0], pred2));
2855 }
2856
2857 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2858 for it. PRED2[0] is the predicate for the instruction whose result
2859 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2860 for it. Return true if we can prove that the two predicates are
2861 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2862 with PRED1[0] without changing behavior. */
2863
2864 bool
2865 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2866 {
2867 machine_mode mode = GET_MODE (pred1[0]);
2868 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2869 && mode == GET_MODE (pred2[0])
2870 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2871 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2872
2873 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2874 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2875 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2876 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2877 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2878 }
2879
2880 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2881 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2882 Use TARGET as the target register if nonnull and convenient. */
2883
2884 static rtx
2885 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2886 machine_mode data_mode, rtx op1, rtx op2)
2887 {
2888 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2889 expand_operand ops[5];
2890 create_output_operand (&ops[0], target, pred_mode);
2891 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2892 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2893 create_input_operand (&ops[3], op1, data_mode);
2894 create_input_operand (&ops[4], op2, data_mode);
2895 expand_insn (icode, 5, ops);
2896 return ops[0].value;
2897 }
2898
2899 /* Use a comparison to convert integer vector SRC into MODE, which is
2900 the corresponding SVE predicate mode. Use TARGET for the result
2901 if it's nonnull and convenient. */
2902
2903 static rtx
2904 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2905 {
2906 machine_mode src_mode = GET_MODE (src);
2907 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2908 src, CONST0_RTX (src_mode));
2909 }
2910
2911 /* Return true if we can move VALUE into a register using a single
2912 CNT[BHWD] instruction. */
2913
2914 static bool
2915 aarch64_sve_cnt_immediate_p (poly_int64 value)
2916 {
2917 HOST_WIDE_INT factor = value.coeffs[0];
2918 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2919 return (value.coeffs[1] == factor
2920 && IN_RANGE (factor, 2, 16 * 16)
2921 && (factor & 1) == 0
2922 && factor <= 16 * (factor & -factor));
2923 }
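
/* For example, the poly_int 16 + 16x (the number of bytes in an SVE vector)
   satisfies the conditions and corresponds to CNTB, while 6 + 6x
   corresponds to CNTD with MUL #3.  3 + 3x fails because the factor is odd,
   and 34 + 34x fails because 34 > 16 * 2.  */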
2924
2925 /* Likewise for rtx X. */
2926
2927 bool
2928 aarch64_sve_cnt_immediate_p (rtx x)
2929 {
2930 poly_int64 value;
2931 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2932 }
2933
2934 /* Return the asm string for an instruction with a CNT-like vector size
2935 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2936 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2937 first part of the operands template (the part that comes before the
2938 vector size itself). PATTERN is the pattern to use. FACTOR is the
2939 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2940 in each quadword. If it is zero, we can use any element size. */
2941
2942 static char *
2943 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2944 aarch64_svpattern pattern,
2945 unsigned int factor,
2946 unsigned int nelts_per_vq)
2947 {
2948 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2949
2950 if (nelts_per_vq == 0)
2951 /* There is some overlap in the ranges of the four CNT instructions.
2952 Here we always use the smallest possible element size, so that the
2953 multiplier is 1 wherever possible. */
2954 nelts_per_vq = factor & -factor;
2955 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2956 gcc_assert (IN_RANGE (shift, 1, 4));
2957 char suffix = "dwhb"[shift - 1];
2958
2959 factor >>= shift;
2960 unsigned int written;
2961 if (pattern == AARCH64_SV_ALL && factor == 1)
2962 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2963 prefix, suffix, operands);
2964 else if (factor == 1)
2965 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2966 prefix, suffix, operands, svpattern_token (pattern));
2967 else
2968 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2969 prefix, suffix, operands, svpattern_token (pattern),
2970 factor);
2971 gcc_assert (written < sizeof (buffer));
2972 return buffer;
2973 }
2974
2975 /* Return the asm string for an instruction with a CNT-like vector size
2976 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2977 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2978 first part of the operands template (the part that comes before the
2979 vector size itself). X is the value of the vector size operand,
2980 as a polynomial integer rtx; we need to convert this into an "all"
2981 pattern with a multiplier. */
2982
2983 char *
2984 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2985 rtx x)
2986 {
2987 poly_int64 value = rtx_to_poly_int64 (x);
2988 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2989 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2990 value.coeffs[1], 0);
2991 }
2992
2993 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2994
2995 bool
2996 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2997 {
2998 poly_int64 value;
2999 return (poly_int_rtx_p (x, &value)
3000 && (aarch64_sve_cnt_immediate_p (value)
3001 || aarch64_sve_cnt_immediate_p (-value)));
3002 }
3003
3004 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3005 operand 0. */
3006
3007 char *
3008 aarch64_output_sve_scalar_inc_dec (rtx offset)
3009 {
3010 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3011 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3012 if (offset_value.coeffs[1] > 0)
3013 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3014 offset_value.coeffs[1], 0);
3015 else
3016 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3017 -offset_value.coeffs[1], 0);
3018 }
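
/* For example, an OFFSET of 16 + 16x produces "incb\t%x0", while an OFFSET
   of -2 - 2x produces "decd\t%x0" (CNTD counts two elements per 128 bits,
   so no multiplier is needed).  */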
3019
3020 /* Return true if we can add VALUE to a register using a single ADDVL
3021 or ADDPL instruction. */
3022
3023 static bool
3024 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3025 {
3026 HOST_WIDE_INT factor = value.coeffs[0];
3027 if (factor == 0 || value.coeffs[1] != factor)
3028 return false;
3029 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3030 and a value of 16 is one vector width. */
3031 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3032 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3033 }
3034
3035 /* Likewise for rtx X. */
3036
3037 bool
3038 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3039 {
3040 poly_int64 value;
3041 return (poly_int_rtx_p (x, &value)
3042 && aarch64_sve_addvl_addpl_immediate_p (value));
3043 }
3044
3045 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3046 to operand 1 and storing the result in operand 0. */
3047
3048 char *
3049 aarch64_output_sve_addvl_addpl (rtx offset)
3050 {
3051 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3052 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3053 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3054
3055 int factor = offset_value.coeffs[1];
3056 if ((factor & 15) == 0)
3057 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3058 else
3059 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3060 return buffer;
3061 }
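
/* For example, an OFFSET of 32 + 32x (two full vectors) produces
   "addvl\t%x0, %x1, #2", while 6 + 6x (three predicate widths) produces
   "addpl\t%x0, %x1, #3".  */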
3062
3063 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3064 instruction. If it is, store the number of elements in each vector
3065 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3066 factor in *FACTOR_OUT (if nonnull). */
3067
3068 bool
3069 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3070 unsigned int *nelts_per_vq_out)
3071 {
3072 rtx elt;
3073 poly_int64 value;
3074
3075 if (!const_vec_duplicate_p (x, &elt)
3076 || !poly_int_rtx_p (elt, &value))
3077 return false;
3078
3079 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3080 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3081 /* There's no vector INCB. */
3082 return false;
3083
3084 HOST_WIDE_INT factor = value.coeffs[0];
3085 if (value.coeffs[1] != factor)
3086 return false;
3087
3088 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3089 if ((factor % nelts_per_vq) != 0
3090 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3091 return false;
3092
3093 if (factor_out)
3094 *factor_out = factor;
3095 if (nelts_per_vq_out)
3096 *nelts_per_vq_out = nelts_per_vq;
3097 return true;
3098 }
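
/* For example, a VNx4SI constant in which every element is 8 + 8x (twice
   the number of 32-bit elements in an SVE vector) is accepted with
   *FACTOR_OUT == 8 and *NELTS_PER_VQ_OUT == 4, and is later emitted as an
   INCW with MUL #2.  A duplicate of 6 + 6x is rejected because 6 is not a
   multiple of 4.  */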
3099
3100 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3101 instruction. */
3102
3103 bool
3104 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3105 {
3106 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3107 }
3108
3109 /* Return the asm template for an SVE vector INC or DEC instruction.
3110 OPERANDS gives the operands before the vector count and X is the
3111 value of the vector count operand itself. */
3112
3113 char *
3114 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3115 {
3116 int factor;
3117 unsigned int nelts_per_vq;
3118 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3119 gcc_unreachable ();
3120 if (factor < 0)
3121 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3122 -factor, nelts_per_vq);
3123 else
3124 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3125 factor, nelts_per_vq);
3126 }
3127
3128 static int
3129 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3130 scalar_int_mode mode)
3131 {
3132 int i;
3133 unsigned HOST_WIDE_INT val, val2, mask;
3134 int one_match, zero_match;
3135 int num_insns;
3136
3137 val = INTVAL (imm);
3138
3139 if (aarch64_move_imm (val, mode))
3140 {
3141 if (generate)
3142 emit_insn (gen_rtx_SET (dest, imm));
3143 return 1;
3144 }
3145
3146 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3147 (with XXXX non-zero). In that case check to see if the move can be done in
3148 a smaller mode. */
3149 val2 = val & 0xffffffff;
3150 if (mode == DImode
3151 && aarch64_move_imm (val2, SImode)
3152 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3153 {
3154 if (generate)
3155 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3156
3157 /* Check if we have to emit a second instruction by checking to see
3158 if any of the upper 32 bits of the original DI mode value is set. */
3159 if (val == val2)
3160 return 1;
3161
3162 i = (val >> 48) ? 48 : 32;
3163
3164 if (generate)
3165 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3166 GEN_INT ((val >> i) & 0xffff)));
3167
3168 return 2;
3169 }
3170
3171 if ((val >> 32) == 0 || mode == SImode)
3172 {
3173 if (generate)
3174 {
3175 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3176 if (mode == SImode)
3177 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3178 GEN_INT ((val >> 16) & 0xffff)));
3179 else
3180 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3181 GEN_INT ((val >> 16) & 0xffff)));
3182 }
3183 return 2;
3184 }
3185
3186 /* Remaining cases are all for DImode. */
3187
3188 mask = 0xffff;
3189 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3190 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3191 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3192 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3193
3194 if (zero_match != 2 && one_match != 2)
3195 {
3196 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3197 For a 64-bit bitmask try whether changing 16 bits to all ones or
3198 zeroes creates a valid bitmask. To check any repeated bitmask,
3199 try using 16 bits from the other 32-bit half of val. */
3200
3201 for (i = 0; i < 64; i += 16, mask <<= 16)
3202 {
3203 val2 = val & ~mask;
3204 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3205 break;
3206 val2 = val | mask;
3207 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3208 break;
3209 val2 = val2 & ~mask;
3210 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3211 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3212 break;
3213 }
3214 if (i != 64)
3215 {
3216 if (generate)
3217 {
3218 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3219 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3220 GEN_INT ((val >> i) & 0xffff)));
3221 }
3222 return 2;
3223 }
3224 }
3225
3226 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3227 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3228 otherwise skip zero bits. */
3229
3230 num_insns = 1;
3231 mask = 0xffff;
3232 val2 = one_match > zero_match ? ~val : val;
3233 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3234
3235 if (generate)
3236 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3237 ? (val | ~(mask << i))
3238 : (val & (mask << i)))));
3239 for (i += 16; i < 64; i += 16)
3240 {
3241 if ((val2 & (mask << i)) == 0)
3242 continue;
3243 if (generate)
3244 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3245 GEN_INT ((val >> i) & 0xffff)));
3246 num_insns ++;
3247 }
3248
3249 return num_insns;
3250 }
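
/* As a worked example, moving 0x0000123400005678 into a DImode register is
   not a single MOVZ/MOVN/bitmask immediate, but the low 32 bits are a valid
   SImode immediate and bits [47:32] are the only other nonzero field, so
   the function emits two instructions:

   mov  x0, #0x5678
   movk x0, #0x1234, lsl #32

   and returns 2.  (Register x0 is just illustrative.)  */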
3251
3252 /* Return whether imm is a 128-bit immediate which is simple enough to
3253 expand inline. */
3254 bool
3255 aarch64_mov128_immediate (rtx imm)
3256 {
3257 if (GET_CODE (imm) == CONST_INT)
3258 return true;
3259
3260 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3261
3262 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3263 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3264
3265 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3266 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3267 }
3268
3269
3270 /* Return the number of temporary registers that aarch64_add_offset_1
3271 would need to add OFFSET to a register. */
3272
3273 static unsigned int
3274 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3275 {
3276 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3277 }
3278
3279 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3280 a non-polynomial OFFSET. MODE is the mode of the addition.
3281 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3282 be set and CFA adjustments added to the generated instructions.
3283
3284 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3285 temporary if register allocation is already complete. This temporary
3286 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3287 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3288 the immediate again.
3289
3290 Since this function may be used to adjust the stack pointer, we must
3291 ensure that it cannot cause transient stack deallocation (for example
3292 by first incrementing SP and then decrementing when adjusting by a
3293 large immediate). */
3294
3295 static void
3296 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3297 rtx src, HOST_WIDE_INT offset, rtx temp1,
3298 bool frame_related_p, bool emit_move_imm)
3299 {
3300 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3301 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3302
3303 HOST_WIDE_INT moffset = abs_hwi (offset);
3304 rtx_insn *insn;
3305
3306 if (!moffset)
3307 {
3308 if (!rtx_equal_p (dest, src))
3309 {
3310 insn = emit_insn (gen_rtx_SET (dest, src));
3311 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3312 }
3313 return;
3314 }
3315
3316 /* Single instruction adjustment. */
3317 if (aarch64_uimm12_shift (moffset))
3318 {
3319 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3320 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3321 return;
3322 }
3323
3324 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3325 and either:
3326
3327 a) the offset cannot be loaded by a 16-bit move or
3328 b) there is no spare register into which we can move it. */
3329 if (moffset < 0x1000000
3330 && ((!temp1 && !can_create_pseudo_p ())
3331 || !aarch64_move_imm (moffset, mode)))
3332 {
3333 HOST_WIDE_INT low_off = moffset & 0xfff;
3334
3335 low_off = offset < 0 ? -low_off : low_off;
3336 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3337 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3338 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3339 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3340 return;
3341 }
3342
3343 /* Emit a move immediate if required and an addition/subtraction. */
3344 if (emit_move_imm)
3345 {
3346 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3347 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3348 }
3349 insn = emit_insn (offset < 0
3350 ? gen_sub3_insn (dest, src, temp1)
3351 : gen_add3_insn (dest, src, temp1));
3352 if (frame_related_p)
3353 {
3354 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3355 rtx adj = plus_constant (mode, src, offset);
3356 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3357 }
3358 }
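
/* For example, adding the constant 0x123456 to a register after register
   allocation, with no spare temporary, uses the two-addition path above:

   add  x0, x1, #0x456
   add  x0, x0, #0x123000

   whereas with a temporary available the offset is moved into it first and
   a single register-register addition is emitted.  (Register numbers are
   illustrative.)  */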
3359
3360 /* Return the number of temporary registers that aarch64_add_offset
3361 would need to move OFFSET into a register or add OFFSET to a register;
3362 ADD_P is true if we want the latter rather than the former. */
3363
3364 static unsigned int
3365 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3366 {
3367 /* This follows the same structure as aarch64_add_offset. */
3368 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3369 return 0;
3370
3371 unsigned int count = 0;
3372 HOST_WIDE_INT factor = offset.coeffs[1];
3373 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3374 poly_int64 poly_offset (factor, factor);
3375 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3376 /* Need one register for the ADDVL/ADDPL result. */
3377 count += 1;
3378 else if (factor != 0)
3379 {
3380 factor = abs (factor);
3381 if (factor > 16 * (factor & -factor))
3382 /* Need one register for the CNT result and one for the multiplication
3383 factor. If necessary, the second temporary can be reused for the
3384 constant part of the offset. */
3385 return 2;
3386 /* Need one register for the CNT result (which might then
3387 be shifted). */
3388 count += 1;
3389 }
3390 return count + aarch64_add_offset_1_temporaries (constant);
3391 }
3392
3393 /* If X can be represented as a poly_int64, return the number
3394 of temporaries that are required to add it to a register.
3395 Return -1 otherwise. */
3396
3397 int
3398 aarch64_add_offset_temporaries (rtx x)
3399 {
3400 poly_int64 offset;
3401 if (!poly_int_rtx_p (x, &offset))
3402 return -1;
3403 return aarch64_offset_temporaries (true, offset);
3404 }
3405
3406 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3407 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3408 be set and CFA adjustments added to the generated instructions.
3409
3410 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3411 temporary if register allocation is already complete. This temporary
3412 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3413 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3414 false to avoid emitting the immediate again.
3415
3416 TEMP2, if nonnull, is a second temporary register that doesn't
3417 overlap either DEST or REG.
3418
3419 Since this function may be used to adjust the stack pointer, we must
3420 ensure that it cannot cause transient stack deallocation (for example
3421 by first incrementing SP and then decrementing when adjusting by a
3422 large immediate). */
3423
3424 static void
3425 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3426 poly_int64 offset, rtx temp1, rtx temp2,
3427 bool frame_related_p, bool emit_move_imm = true)
3428 {
3429 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3430 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3431 gcc_assert (temp1 == NULL_RTX
3432 || !frame_related_p
3433 || !reg_overlap_mentioned_p (temp1, dest));
3434 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3435
3436 /* Try using ADDVL or ADDPL to add the whole value. */
3437 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3438 {
3439 rtx offset_rtx = gen_int_mode (offset, mode);
3440 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3441 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3442 return;
3443 }
3444
3445 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3446 SVE vector register, over and above the minimum size of 128 bits.
3447 This is equivalent to half the value returned by CNTD with a
3448 vector shape of ALL. */
3449 HOST_WIDE_INT factor = offset.coeffs[1];
3450 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3451
3452 /* Try using ADDVL or ADDPL to add the VG-based part. */
3453 poly_int64 poly_offset (factor, factor);
3454 if (src != const0_rtx
3455 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3456 {
3457 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3458 if (frame_related_p)
3459 {
3460 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3461 RTX_FRAME_RELATED_P (insn) = true;
3462 src = dest;
3463 }
3464 else
3465 {
3466 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3467 src = aarch64_force_temporary (mode, temp1, addr);
3468 temp1 = temp2;
3469 temp2 = NULL_RTX;
3470 }
3471 }
3472 /* Otherwise use a CNT-based sequence. */
3473 else if (factor != 0)
3474 {
3475 /* Use a subtraction if we have a negative factor. */
3476 rtx_code code = PLUS;
3477 if (factor < 0)
3478 {
3479 factor = -factor;
3480 code = MINUS;
3481 }
3482
3483 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3484 into the multiplication. */
3485 rtx val;
3486 int shift = 0;
3487 if (factor & 1)
3488 /* Use a right shift by 1. */
3489 shift = -1;
3490 else
3491 factor /= 2;
3492 HOST_WIDE_INT low_bit = factor & -factor;
3493 if (factor <= 16 * low_bit)
3494 {
3495 if (factor > 16 * 8)
3496 {
3497 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3498 the value with the minimum multiplier and shift it into
3499 position. */
3500 int extra_shift = exact_log2 (low_bit);
3501 shift += extra_shift;
3502 factor >>= extra_shift;
3503 }
3504 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3505 }
3506 else
3507 {
3508 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3509 directly, since that should increase the chances of being
3510 able to use a shift and add sequence. If LOW_BIT itself
3511 is out of range, just use CNTD. */
3512 if (low_bit <= 16 * 8)
3513 factor /= low_bit;
3514 else
3515 low_bit = 1;
3516
3517 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3518 val = aarch64_force_temporary (mode, temp1, val);
3519
3520 if (can_create_pseudo_p ())
3521 {
3522 rtx coeff1 = gen_int_mode (factor, mode);
3523 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3524 }
3525 else
3526 {
3527 /* Go back to using a negative multiplication factor if we have
3528 no register from which to subtract. */
3529 if (code == MINUS && src == const0_rtx)
3530 {
3531 factor = -factor;
3532 code = PLUS;
3533 }
3534 rtx coeff1 = gen_int_mode (factor, mode);
3535 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3536 val = gen_rtx_MULT (mode, val, coeff1);
3537 }
3538 }
3539
3540 if (shift > 0)
3541 {
3542 /* Multiply by 1 << SHIFT. */
3543 val = aarch64_force_temporary (mode, temp1, val);
3544 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3545 }
3546 else if (shift == -1)
3547 {
3548 /* Divide by 2. */
3549 val = aarch64_force_temporary (mode, temp1, val);
3550 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3551 }
3552
3553 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3554 if (src != const0_rtx)
3555 {
3556 val = aarch64_force_temporary (mode, temp1, val);
3557 val = gen_rtx_fmt_ee (code, mode, src, val);
3558 }
3559 else if (code == MINUS)
3560 {
3561 val = aarch64_force_temporary (mode, temp1, val);
3562 val = gen_rtx_NEG (mode, val);
3563 }
3564
3565 if (constant == 0 || frame_related_p)
3566 {
3567 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3568 if (frame_related_p)
3569 {
3570 RTX_FRAME_RELATED_P (insn) = true;
3571 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3572 gen_rtx_SET (dest, plus_constant (Pmode, src,
3573 poly_offset)));
3574 }
3575 src = dest;
3576 if (constant == 0)
3577 return;
3578 }
3579 else
3580 {
3581 src = aarch64_force_temporary (mode, temp1, val);
3582 temp1 = temp2;
3583 temp2 = NULL_RTX;
3584 }
3585
3586 emit_move_imm = true;
3587 }
3588
3589 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3590 frame_related_p, emit_move_imm);
3591 }
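
/* Editor's note -- a worked example, not part of the original sources:
   in the CNT-based sequence above, FACTOR == 12 takes the even path and
   is halved to 6 before the constant is built, so no final shift is
   needed.  FACTOR == 11 is odd, so it is kept as-is and SHIFT is set to
   -1, and the result is divided by 2 at the end with an arithmetic shift
   right.  FACTOR == 512 is halved to 256, which exceeds 16 * 8, so it is
   reduced to 1 with SHIFT == 8 and the value is shifted left by 8
   afterwards.  */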
3592
3593 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3594 than a poly_int64. */
3595
3596 void
3597 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3598 rtx offset_rtx, rtx temp1, rtx temp2)
3599 {
3600 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3601 temp1, temp2, false);
3602 }
3603
3604 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3605 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3606 if TEMP1 already contains abs (DELTA). */
3607
3608 static inline void
3609 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3610 {
3611 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3612 temp1, temp2, true, emit_move_imm);
3613 }
3614
3615 /* Subtract DELTA from the stack pointer, marking the instructions
3616 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3617 if nonnull. */
3618
3619 static inline void
3620 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3621 bool emit_move_imm = true)
3622 {
3623 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3624 temp1, temp2, frame_related_p, emit_move_imm);
3625 }
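
/* Editor's sketch -- illustrative only, not part of the original sources:
   a caller with two free scratch registers could adjust the stack by a
   poly_int64 FRAME_SIZE roughly as follows, assuming R9/R10 happen to be
   usable as temporaries at that point:

	rtx tmp0 = gen_rtx_REG (Pmode, R9_REGNUM);
	rtx tmp1 = gen_rtx_REG (Pmode, R10_REGNUM);
	aarch64_sub_sp (tmp0, tmp1, frame_size, true);

   This allocates FRAME_SIZE bytes and, because FRAME_RELATED_P is true,
   marks the adjustment for CFI purposes.  */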
3626
3627 /* Set DEST to (vec_series BASE STEP). */
3628
3629 static void
3630 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3631 {
3632 machine_mode mode = GET_MODE (dest);
3633 scalar_mode inner = GET_MODE_INNER (mode);
3634
3635 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3636 if (!aarch64_sve_index_immediate_p (base))
3637 base = force_reg (inner, base);
3638 if (!aarch64_sve_index_immediate_p (step))
3639 step = force_reg (inner, step);
3640
3641 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3642 }
3643
3644 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3645 register of mode MODE. Use TARGET for the result if it's nonnull
3646 and convenient.
3647
3648 The two vector modes must have the same element mode. The behavior
3649 is to duplicate architectural lane N of SRC into architectural lanes
3650 N + I * STEP of the result. On big-endian targets, architectural
3651 lane 0 of an Advanced SIMD vector is the last element of the vector
3652 in memory layout, so for big-endian targets this operation has the
3653 effect of reversing SRC before duplicating it. Callers need to
3654 account for this. */
3655
3656 rtx
3657 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3658 {
3659 machine_mode src_mode = GET_MODE (src);
3660 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3661 insn_code icode = (BYTES_BIG_ENDIAN
3662 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3663 : code_for_aarch64_vec_duplicate_vq_le (mode));
3664
3665 unsigned int i = 0;
3666 expand_operand ops[3];
3667 create_output_operand (&ops[i++], target, mode);
3668 create_output_operand (&ops[i++], src, src_mode);
3669 if (BYTES_BIG_ENDIAN)
3670 {
3671 /* Create a PARALLEL describing the reversal of SRC. */
3672 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3673 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3674 nelts_per_vq - 1, -1);
3675 create_fixed_operand (&ops[i++], sel);
3676 }
3677 expand_insn (icode, i, ops);
3678 return ops[0].value;
3679 }
3680
3681 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3682 the memory image into DEST. Return true on success. */
3683
3684 static bool
3685 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3686 {
3687 src = force_const_mem (GET_MODE (src), src);
3688 if (!src)
3689 return false;
3690
3691 /* Make sure that the address is legitimate. */
3692 if (!aarch64_sve_ld1rq_operand_p (src))
3693 {
3694 rtx addr = force_reg (Pmode, XEXP (src, 0));
3695 src = replace_equiv_address (src, addr);
3696 }
3697
3698 machine_mode mode = GET_MODE (dest);
3699 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3700 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3701 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3702 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3703 return true;
3704 }
3705
3706 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3707 SVE data mode and isn't a legitimate constant. Use TARGET for the
3708 result if convenient.
3709
3710 The returned register can have whatever mode seems most natural
3711 given the contents of SRC. */
3712
3713 static rtx
3714 aarch64_expand_sve_const_vector (rtx target, rtx src)
3715 {
3716 machine_mode mode = GET_MODE (src);
3717 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3718 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3719 scalar_mode elt_mode = GET_MODE_INNER (mode);
3720 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3721 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3722
3723 if (nelts_per_pattern == 1 && encoded_bits == 128)
3724 {
3725 /* The constant is a duplicated quadword but can't be narrowed
3726 beyond a quadword. Get the memory image of the first quadword
3727 as a 128-bit vector and try using LD1RQ to load it from memory.
3728
3729 The effect for both endiannesses is to load memory lane N into
3730 architectural lanes N + I * STEP of the result. On big-endian
3731 targets, the layout of the 128-bit vector in an Advanced SIMD
3732 register would be different from its layout in an SVE register,
3733 but this 128-bit vector is a memory value only. */
3734 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3735 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3736 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3737 return target;
3738 }
3739
3740 if (nelts_per_pattern == 1 && encoded_bits < 128)
3741 {
3742 /* The vector is a repeating sequence of 64 bits or fewer.
3743 See if we can load them using an Advanced SIMD move and then
3744 duplicate it to fill a vector. This is better than using a GPR
3745 move because it keeps everything in the same register file. */
3746 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3747 rtx_vector_builder builder (vq_mode, npatterns, 1);
3748 for (unsigned int i = 0; i < npatterns; ++i)
3749 {
3750 /* We want memory lane N to go into architectural lane N,
3751 so reverse for big-endian targets. The DUP .Q pattern
3752 has a compensating reverse built-in. */
3753 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3754 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3755 }
3756 rtx vq_src = builder.build ();
3757 if (aarch64_simd_valid_immediate (vq_src, NULL))
3758 {
3759 vq_src = force_reg (vq_mode, vq_src);
3760 return aarch64_expand_sve_dupq (target, mode, vq_src);
3761 }
3762
3763 /* Get an integer representation of the repeating part of Advanced
3764 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3765 which for big-endian targets is lane-swapped wrt a normal
3766 Advanced SIMD vector. This means that for both endiannesses,
3767 memory lane N of SVE vector SRC corresponds to architectural
3768 lane N of a register holding VQ_SRC. This in turn means that
3769 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3770 as a single 128-bit value) and thus that memory lane 0 of SRC is
3771 in the lsb of the integer. Duplicating the integer therefore
3772 ensures that memory lane N of SRC goes into architectural lane
3773 N + I * INDEX of the SVE register. */
3774 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3775 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3776 if (elt_value)
3777 {
3778 /* Pretend that we had a vector of INT_MODE to start with. */
3779 elt_mode = int_mode;
3780 mode = aarch64_full_sve_mode (int_mode).require ();
3781
3782 /* If the integer can be moved into a general register by a
3783 single instruction, do that and duplicate the result. */
3784 if (CONST_INT_P (elt_value)
3785 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3786 {
3787 elt_value = force_reg (elt_mode, elt_value);
3788 return expand_vector_broadcast (mode, elt_value);
3789 }
3790 }
3791 else if (npatterns == 1)
3792 /* We're duplicating a single value, but can't do better than
3793 force it to memory and load from there. This handles things
3794 like symbolic constants. */
3795 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3796
3797 if (elt_value)
3798 {
3799 /* Load the element from memory if we can, otherwise move it into
3800 a register and use a DUP. */
3801 rtx op = force_const_mem (elt_mode, elt_value);
3802 if (!op)
3803 op = force_reg (elt_mode, elt_value);
3804 return expand_vector_broadcast (mode, op);
3805 }
3806 }
3807
3808 /* Try using INDEX. */
3809 rtx base, step;
3810 if (const_vec_series_p (src, &base, &step))
3811 {
3812 aarch64_expand_vec_series (target, base, step);
3813 return target;
3814 }
3815
3816 /* From here on, it's better to force the whole constant to memory
3817 if we can. */
3818 if (GET_MODE_NUNITS (mode).is_constant ())
3819 return NULL_RTX;
3820
3821 /* Expand each pattern individually. */
3822 gcc_assert (npatterns > 1);
3823 rtx_vector_builder builder;
3824 auto_vec<rtx, 16> vectors (npatterns);
3825 for (unsigned int i = 0; i < npatterns; ++i)
3826 {
3827 builder.new_vector (mode, 1, nelts_per_pattern);
3828 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3829 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3830 vectors.quick_push (force_reg (mode, builder.build ()));
3831 }
3832
3833 /* Use permutes to interleave the separate vectors. */
3834 while (npatterns > 1)
3835 {
3836 npatterns /= 2;
3837 for (unsigned int i = 0; i < npatterns; ++i)
3838 {
3839 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3840 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3841 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3842 vectors[i] = tmp;
3843 }
3844 }
3845 gcc_assert (vectors[0] == target);
3846 return target;
3847 }
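
/* Editor's note -- a worked example, not part of the original sources:
   for a VNx4SI constant such as { 1, 2, 3, 4, 1, 2, 3, 4, ... } we have
   NPATTERNS == 4, NELTS_PER_PATTERN == 1 and 32-bit elements, so
   ENCODED_BITS == 128 and the routine above tries the LD1RQ path first.
   A shorter repeating constant such as { 5, 6, 5, 6, ... } has
   ENCODED_BITS == 64 and instead tries, in order, an Advanced SIMD
   immediate plus DUPQ, a general-register immediate plus broadcast, and
   a broadcast of the repeating 64-bit chunk loaded from memory or moved
   into a register.  */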
3848
3849 /* Use WHILE to set a predicate register of mode MODE in which the first
3850 VL bits are set and the rest are clear. Use TARGET for the register
3851 if it's nonnull and convenient. */
3852
3853 static rtx
3854 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3855 unsigned int vl)
3856 {
3857 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3858 target = aarch64_target_reg (target, mode);
3859 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3860 return target;
3861 }
3862
3863 static rtx
3864 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3865
3866 /* BUILDER is a constant predicate in which the index of every set bit
3867 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3868 by inverting every element at a multiple of ELT_SIZE and EORing the
3869 result with an ELT_SIZE PTRUE.
3870
3871 Return a register that contains the constant on success, otherwise
3872 return null. Use TARGET as the register if it is nonnull and
3873 convenient. */
3874
3875 static rtx
3876 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3877 unsigned int elt_size)
3878 {
3879 /* Invert every element at a multiple of ELT_SIZE, keeping the
3880 other bits zero. */
3881 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3882 builder.nelts_per_pattern ());
3883 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3884 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3885 inv_builder.quick_push (const1_rtx);
3886 else
3887 inv_builder.quick_push (const0_rtx);
3888 inv_builder.finalize ();
3889
3890 /* See if we can load the constant cheaply. */
3891 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3892 if (!inv)
3893 return NULL_RTX;
3894
3895 /* EOR the result with an ELT_SIZE PTRUE. */
3896 rtx mask = aarch64_ptrue_all (elt_size);
3897 mask = force_reg (VNx16BImode, mask);
3898 target = aarch64_target_reg (target, VNx16BImode);
3899 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3900 return target;
3901 }
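
/* Editor's note -- a worked example, not part of the original sources:
   suppose BUILDER describes the .B predicate { 0, 1, 1, 1, ... } with
   only the first element clear.  Inverting each element gives
   { 1, 0, 0, 0, ... }, which is a one-element partial PTRUE and so is
   cheap to load; EORing that with PTRUE .B, ALL then recreates the
   original constant.  */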
3902
3903 /* BUILDER is a constant predicate in which the index of every set bit
3904 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3905 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3906 register on success, otherwise return null. Use TARGET as the register
3907 if nonnull and convenient. */
3908
3909 static rtx
3910 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3911 unsigned int elt_size,
3912 unsigned int permute_size)
3913 {
3914 /* We're going to split the constant into two new constants A and B,
3915 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3916 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3917
3918 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3919 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3920
3921 where _ indicates elements that will be discarded by the permute.
3922
3923 First calculate the ELT_SIZEs for A and B. */
3924 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3925 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3926 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3927 if (INTVAL (builder.elt (i)) != 0)
3928 {
3929 if (i & permute_size)
3930 b_elt_size |= i - permute_size;
3931 else
3932 a_elt_size |= i;
3933 }
3934 a_elt_size &= -a_elt_size;
3935 b_elt_size &= -b_elt_size;
3936
3937 /* Now construct the vectors themselves. */
3938 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3939 builder.nelts_per_pattern ());
3940 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3941 builder.nelts_per_pattern ());
3942 unsigned int nelts = builder.encoded_nelts ();
3943 for (unsigned int i = 0; i < nelts; ++i)
3944 if (i & (elt_size - 1))
3945 {
3946 a_builder.quick_push (const0_rtx);
3947 b_builder.quick_push (const0_rtx);
3948 }
3949 else if ((i & permute_size) == 0)
3950 {
3951 /* The A and B elements are significant. */
3952 a_builder.quick_push (builder.elt (i));
3953 b_builder.quick_push (builder.elt (i + permute_size));
3954 }
3955 else
3956 {
3957 /* The A and B elements are going to be discarded, so pick whatever
3958 is likely to give a nice constant. We are targeting element
3959 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3960 with the aim of each being a sequence of ones followed by
3961 a sequence of zeros. So:
3962
3963 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3964 duplicate the last X_ELT_SIZE element, to extend the
3965 current sequence of ones or zeros.
3966
3967 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3968 zero, so that the constant really does have X_ELT_SIZE and
3969 not a smaller size. */
3970 if (a_elt_size > permute_size)
3971 a_builder.quick_push (const0_rtx);
3972 else
3973 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3974 if (b_elt_size > permute_size)
3975 b_builder.quick_push (const0_rtx);
3976 else
3977 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3978 }
3979 a_builder.finalize ();
3980 b_builder.finalize ();
3981
3982 /* Try loading A into a register. */
3983 rtx_insn *last = get_last_insn ();
3984 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3985 if (!a)
3986 return NULL_RTX;
3987
3988 /* Try loading B into a register. */
3989 rtx b = a;
3990 if (a_builder != b_builder)
3991 {
3992 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3993 if (!b)
3994 {
3995 delete_insns_since (last);
3996 return NULL_RTX;
3997 }
3998 }
3999
4000 /* Emit the TRN1 itself. */
4001 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4002 target = aarch64_target_reg (target, mode);
4003 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4004 gen_lowpart (mode, a),
4005 gen_lowpart (mode, b)));
4006 return target;
4007 }
4008
4009 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4010 constant in BUILDER into an SVE predicate register. Return the register
4011 on success, otherwise return null. Use TARGET for the register if
4012 nonnull and convenient.
4013
4014 ALLOW_RECURSE_P is true if we can use methods that would call this
4015 function recursively. */
4016
4017 static rtx
4018 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4019 bool allow_recurse_p)
4020 {
4021 if (builder.encoded_nelts () == 1)
4022 /* A PFALSE or a PTRUE .B ALL. */
4023 return aarch64_emit_set_immediate (target, builder);
4024
4025 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4026 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4027 {
4028 /* If we can load the constant using PTRUE, use it as-is. */
4029 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4030 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4031 return aarch64_emit_set_immediate (target, builder);
4032
4033 /* Otherwise use WHILE to set the first VL bits. */
4034 return aarch64_sve_move_pred_via_while (target, mode, vl);
4035 }
4036
4037 if (!allow_recurse_p)
4038 return NULL_RTX;
4039
4040 /* Try inverting the vector in element size ELT_SIZE and then EORing
4041 the result with an ELT_SIZE PTRUE. */
4042 if (INTVAL (builder.elt (0)) == 0)
4043 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4044 elt_size))
4045 return res;
4046
4047 /* Try using TRN1 to permute two simpler constants. */
4048 for (unsigned int i = elt_size; i <= 8; i *= 2)
4049 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4050 elt_size, i))
4051 return res;
4052
4053 return NULL_RTX;
4054 }
4055
4056 /* Return an SVE predicate register that contains the VNx16BImode
4057 constant in BUILDER, without going through the move expanders.
4058
4059 The returned register can have whatever mode seems most natural
4060 given the contents of BUILDER. Use TARGET for the result if
4061 convenient. */
4062
4063 static rtx
4064 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4065 {
4066 /* Try loading the constant using pure predicate operations. */
4067 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4068 return res;
4069
4070 /* Try forcing the constant to memory. */
4071 if (builder.full_nelts ().is_constant ())
4072 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4073 {
4074 target = aarch64_target_reg (target, VNx16BImode);
4075 emit_move_insn (target, mem);
4076 return target;
4077 }
4078
4079 /* The last resort is to load the constant as an integer and then
4080 compare it against zero. Use -1 for set bits in order to increase
4081 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4082 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4083 builder.nelts_per_pattern ());
4084 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4085 int_builder.quick_push (INTVAL (builder.elt (i))
4086 ? constm1_rtx : const0_rtx);
4087 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4088 int_builder.build ());
4089 }
4090
4091 /* Set DEST to immediate IMM. */
4092
4093 void
4094 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4095 {
4096 machine_mode mode = GET_MODE (dest);
4097
4098 /* Check on what type of symbol it is. */
4099 scalar_int_mode int_mode;
4100 if ((GET_CODE (imm) == SYMBOL_REF
4101 || GET_CODE (imm) == LABEL_REF
4102 || GET_CODE (imm) == CONST
4103 || GET_CODE (imm) == CONST_POLY_INT)
4104 && is_a <scalar_int_mode> (mode, &int_mode))
4105 {
4106 rtx mem;
4107 poly_int64 offset;
4108 HOST_WIDE_INT const_offset;
4109 enum aarch64_symbol_type sty;
4110
4111 /* If we have (const (plus symbol offset)), separate out the offset
4112 before we start classifying the symbol. */
4113 rtx base = strip_offset (imm, &offset);
4114
4115 /* We must always add an offset involving VL separately, rather than
4116 folding it into the relocation. */
4117 if (!offset.is_constant (&const_offset))
4118 {
4119 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4120 emit_insn (gen_rtx_SET (dest, imm));
4121 else
4122 {
4123 /* Do arithmetic on 32-bit values if the result is smaller
4124 than that. */
4125 if (partial_subreg_p (int_mode, SImode))
4126 {
4127 /* It is invalid to do symbol calculations in modes
4128 narrower than SImode. */
4129 gcc_assert (base == const0_rtx);
4130 dest = gen_lowpart (SImode, dest);
4131 int_mode = SImode;
4132 }
4133 if (base != const0_rtx)
4134 {
4135 base = aarch64_force_temporary (int_mode, dest, base);
4136 aarch64_add_offset (int_mode, dest, base, offset,
4137 NULL_RTX, NULL_RTX, false);
4138 }
4139 else
4140 aarch64_add_offset (int_mode, dest, base, offset,
4141 dest, NULL_RTX, false);
4142 }
4143 return;
4144 }
4145
4146 sty = aarch64_classify_symbol (base, const_offset);
4147 switch (sty)
4148 {
4149 case SYMBOL_FORCE_TO_MEM:
4150 if (const_offset != 0
4151 && targetm.cannot_force_const_mem (int_mode, imm))
4152 {
4153 gcc_assert (can_create_pseudo_p ());
4154 base = aarch64_force_temporary (int_mode, dest, base);
4155 aarch64_add_offset (int_mode, dest, base, const_offset,
4156 NULL_RTX, NULL_RTX, false);
4157 return;
4158 }
4159
4160 mem = force_const_mem (ptr_mode, imm);
4161 gcc_assert (mem);
4162
4163 /* If we aren't generating PC relative literals, then
4164 we need to expand the literal pool access carefully.
4165 This is something that needs to be done in a number
4166 of places, so could well live as a separate function. */
4167 if (!aarch64_pcrelative_literal_loads)
4168 {
4169 gcc_assert (can_create_pseudo_p ());
4170 base = gen_reg_rtx (ptr_mode);
4171 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4172 if (ptr_mode != Pmode)
4173 base = convert_memory_address (Pmode, base);
4174 mem = gen_rtx_MEM (ptr_mode, base);
4175 }
4176
4177 if (int_mode != ptr_mode)
4178 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4179
4180 emit_insn (gen_rtx_SET (dest, mem));
4181
4182 return;
4183
4184 case SYMBOL_SMALL_TLSGD:
4185 case SYMBOL_SMALL_TLSDESC:
4186 case SYMBOL_SMALL_TLSIE:
4187 case SYMBOL_SMALL_GOT_28K:
4188 case SYMBOL_SMALL_GOT_4G:
4189 case SYMBOL_TINY_GOT:
4190 case SYMBOL_TINY_TLSIE:
4191 if (const_offset != 0)
4192 {
4193 gcc_assert (can_create_pseudo_p ());
4194 base = aarch64_force_temporary (int_mode, dest, base);
4195 aarch64_add_offset (int_mode, dest, base, const_offset,
4196 NULL_RTX, NULL_RTX, false);
4197 return;
4198 }
4199 /* FALLTHRU */
4200
4201 case SYMBOL_SMALL_ABSOLUTE:
4202 case SYMBOL_TINY_ABSOLUTE:
4203 case SYMBOL_TLSLE12:
4204 case SYMBOL_TLSLE24:
4205 case SYMBOL_TLSLE32:
4206 case SYMBOL_TLSLE48:
4207 aarch64_load_symref_appropriately (dest, imm, sty);
4208 return;
4209
4210 default:
4211 gcc_unreachable ();
4212 }
4213 }
4214
4215 if (!CONST_INT_P (imm))
4216 {
4217 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4218 {
4219 /* Only the low bit of each .H, .S and .D element is defined,
4220 so we can set the upper bits to whatever we like. If the
4221 predicate is all-true in MODE, prefer to set all the undefined
4222 bits as well, so that we can share a single .B predicate for
4223 all modes. */
4224 if (imm == CONSTM1_RTX (mode))
4225 imm = CONSTM1_RTX (VNx16BImode);
4226
4227 /* All methods for constructing predicate modes wider than VNx16BI
4228 will set the upper bits of each element to zero. Expose this
4229 by moving such constants as a VNx16BI, so that all bits are
4230 significant and so that constants for different modes can be
4231 shared. The wider constant will still be available as a
4232 REG_EQUAL note. */
4233 rtx_vector_builder builder;
4234 if (aarch64_get_sve_pred_bits (builder, imm))
4235 {
4236 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4237 if (dest != res)
4238 emit_move_insn (dest, gen_lowpart (mode, res));
4239 return;
4240 }
4241 }
4242
4243 if (GET_CODE (imm) == HIGH
4244 || aarch64_simd_valid_immediate (imm, NULL))
4245 {
4246 emit_insn (gen_rtx_SET (dest, imm));
4247 return;
4248 }
4249
4250 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4251 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4252 {
4253 if (dest != res)
4254 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4255 return;
4256 }
4257
4258 rtx mem = force_const_mem (mode, imm);
4259 gcc_assert (mem);
4260 emit_move_insn (dest, mem);
4261 return;
4262 }
4263
4264 aarch64_internal_mov_immediate (dest, imm, true,
4265 as_a <scalar_int_mode> (mode));
4266 }
4267
4268 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4269 that is known to contain PTRUE. */
4270
4271 void
4272 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4273 {
4274 expand_operand ops[3];
4275 machine_mode mode = GET_MODE (dest);
4276 create_output_operand (&ops[0], dest, mode);
4277 create_input_operand (&ops[1], pred, GET_MODE (pred));
4278 create_input_operand (&ops[2], src, mode);
4279 temporary_volatile_ok v (true);
4280 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4281 }
4282
4283 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4284 operand is in memory. In this case we need to use the predicated LD1
4285 and ST1 instead of LDR and STR, both for correctness on big-endian
4286 targets and because LD1 and ST1 support a wider range of addressing modes.
4287 PRED_MODE is the mode of the predicate.
4288
4289 See the comment at the head of aarch64-sve.md for details about the
4290 big-endian handling. */
4291
4292 void
4293 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4294 {
4295 machine_mode mode = GET_MODE (dest);
4296 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4297 if (!register_operand (src, mode)
4298 && !register_operand (dest, mode))
4299 {
4300 rtx tmp = gen_reg_rtx (mode);
4301 if (MEM_P (src))
4302 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4303 else
4304 emit_move_insn (tmp, src);
4305 src = tmp;
4306 }
4307 aarch64_emit_sve_pred_move (dest, ptrue, src);
4308 }
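
/* Editor's note -- illustrative only, not part of the original sources:
   if both DEST and SRC above are memory operands (an SVE-sized block
   copy, say), the routine first does a predicated LD1 into a fresh
   temporary register and then a predicated ST1 from that temporary,
   since a single move cannot have two memory operands.  */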
4309
4310 /* Called only on big-endian targets. See whether an SVE vector move
4311 from SRC to DEST is effectively a REV[BHW] instruction, because at
4312 least one operand is a subreg of an SVE vector that has wider or
4313 narrower elements. Return true and emit the instruction if so.
4314
4315 For example:
4316
4317 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4318
4319 represents a VIEW_CONVERT between the following vectors, viewed
4320 in memory order:
4321
4322 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4323 R1: { [0], [1], [2], [3], ... }
4324
4325 The high part of lane X in R2 should therefore correspond to lane X*2
4326 of R1, but the register representations are:
4327
4328 msb lsb
4329 R2: ...... [1].high [1].low [0].high [0].low
4330 R1: ...... [3] [2] [1] [0]
4331
4332 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4333 We therefore need a reverse operation to swap the high and low values
4334 around.
4335
4336 This is purely an optimization. Without it we would spill the
4337 subreg operand to the stack in one mode and reload it in the
4338 other mode, which has the same effect as the REV. */
4339
4340 bool
4341 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4342 {
4343 gcc_assert (BYTES_BIG_ENDIAN);
4344 if (GET_CODE (dest) == SUBREG)
4345 dest = SUBREG_REG (dest);
4346 if (GET_CODE (src) == SUBREG)
4347 src = SUBREG_REG (src);
4348
4349 /* The optimization handles two single SVE REGs with different element
4350 sizes. */
4351 if (!REG_P (dest)
4352 || !REG_P (src)
4353 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4354 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4355 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4356 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4357 return false;
4358
4359 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4360 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4361 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4362 UNSPEC_REV_SUBREG);
4363 emit_insn (gen_rtx_SET (dest, unspec));
4364 return true;
4365 }
4366
4367 /* Return a copy of X with mode MODE, without changing its other
4368 attributes. Unlike gen_lowpart, this doesn't care whether the
4369 mode change is valid. */
4370
4371 static rtx
4372 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4373 {
4374 if (GET_MODE (x) == mode)
4375 return x;
4376
4377 x = shallow_copy_rtx (x);
4378 set_mode_and_regno (x, mode, REGNO (x));
4379 return x;
4380 }
4381
4382 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4383 stored in wider integer containers. */
4384
4385 static unsigned int
4386 aarch64_sve_rev_unspec (machine_mode mode)
4387 {
4388 switch (GET_MODE_UNIT_SIZE (mode))
4389 {
4390 case 1: return UNSPEC_REVB;
4391 case 2: return UNSPEC_REVH;
4392 case 4: return UNSPEC_REVW;
4393 }
4394 gcc_unreachable ();
4395 }
4396
4397 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4398 operands. */
4399
4400 void
4401 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4402 {
4403 /* Decide which REV operation we need. The mode with wider elements
4404 determines the mode of the operands and the mode with the narrower
4405 elements determines the reverse width. */
4406 machine_mode mode_with_wider_elts = GET_MODE (dest);
4407 machine_mode mode_with_narrower_elts = GET_MODE (src);
4408 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4409 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4410 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4411
4412 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4413 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4414 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4415
4416 /* Get the operands in the appropriate modes and emit the instruction. */
4417 ptrue = gen_lowpart (pred_mode, ptrue);
4418 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4419 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4420 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4421 dest, ptrue, src));
4422 }
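
/* Editor's note -- a worked example, not part of the original sources:
   for the big-endian case described above, a move between a VNx8HI
   register and a VNx16QI subreg has VNx8HI as the mode with the wider
   elements, so aarch64_sve_rev_unspec picks UNSPEC_REVB for the 1-byte
   quantities and the split emits a REVB with a .H governing predicate,
   i.e. a byte reverse within each halfword.  */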
4423
4424 static bool
4425 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4426 tree exp ATTRIBUTE_UNUSED)
4427 {
4428 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4429 return false;
4430
4431 return true;
4432 }
4433
4434 /* Implement TARGET_PASS_BY_REFERENCE. */
4435
4436 static bool
4437 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4438 {
4439 HOST_WIDE_INT size;
4440 machine_mode dummymode;
4441 int nregs;
4442
4443 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4444 if (arg.mode == BLKmode && arg.type)
4445 size = int_size_in_bytes (arg.type);
4446 else
4447 /* No frontends can create types with variable-sized modes, so we
4448 shouldn't be asked to pass or return them. */
4449 size = GET_MODE_SIZE (arg.mode).to_constant ();
4450
4451 /* Aggregates are passed by reference based on their size. */
4452 if (arg.aggregate_type_p ())
4453 size = int_size_in_bytes (arg.type);
4454
4455 /* Variable-sized arguments are always passed by reference. */
4456 if (size < 0)
4457 return true;
4458
4459 /* Can this be a candidate to be passed in fp/simd register(s)? */
4460 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4461 &dummymode, &nregs,
4462 NULL))
4463 return false;
4464
4465 /* Arguments which are variable-sized or larger than 2 registers are
4466 passed by reference unless they are a homogeneous floating-point
4467 aggregate. */
4468 return size > 2 * UNITS_PER_WORD;
4469 }
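
/* Editor's example -- illustrative only, not part of the original
   sources.  Under the rules above, given

	struct big { long x, y, z; };		// 24 bytes, not an HFA
	struct hfa { double a, b, c, d; };	// 32 bytes, an HFA

   'struct big' is passed by reference because it is larger than two
   registers, while 'struct hfa' is a homogeneous floating-point
   aggregate and so remains a candidate for the SIMD/FP registers
   despite being 32 bytes.  */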
4470
4471 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4472 static bool
4473 aarch64_return_in_msb (const_tree valtype)
4474 {
4475 machine_mode dummy_mode;
4476 int dummy_int;
4477
4478 /* Never happens in little-endian mode. */
4479 if (!BYTES_BIG_ENDIAN)
4480 return false;
4481
4482 /* Only composite types smaller than or equal to 16 bytes can
4483 be potentially returned in registers. */
4484 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4485 || int_size_in_bytes (valtype) <= 0
4486 || int_size_in_bytes (valtype) > 16)
4487 return false;
4488
4489 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4490 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4491 is always passed/returned in the least significant bits of fp/simd
4492 register(s). */
4493 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4494 &dummy_mode, &dummy_int, NULL))
4495 return false;
4496
4497 return true;
4498 }
4499
4500 /* Implement TARGET_FUNCTION_VALUE.
4501 Define how to find the value returned by a function. */
4502
4503 static rtx
4504 aarch64_function_value (const_tree type, const_tree func,
4505 bool outgoing ATTRIBUTE_UNUSED)
4506 {
4507 machine_mode mode;
4508 int unsignedp;
4509 int count;
4510 machine_mode ag_mode;
4511
4512 mode = TYPE_MODE (type);
4513 if (INTEGRAL_TYPE_P (type))
4514 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4515
4516 if (aarch64_return_in_msb (type))
4517 {
4518 HOST_WIDE_INT size = int_size_in_bytes (type);
4519
4520 if (size % UNITS_PER_WORD != 0)
4521 {
4522 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4523 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4524 }
4525 }
4526
4527 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4528 &ag_mode, &count, NULL))
4529 {
4530 if (!aarch64_composite_type_p (type, mode))
4531 {
4532 gcc_assert (count == 1 && mode == ag_mode);
4533 return gen_rtx_REG (mode, V0_REGNUM);
4534 }
4535 else
4536 {
4537 int i;
4538 rtx par;
4539
4540 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4541 for (i = 0; i < count; i++)
4542 {
4543 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4544 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4545 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4546 XVECEXP (par, 0, i) = tmp;
4547 }
4548 return par;
4549 }
4550 }
4551 else
4552 return gen_rtx_REG (mode, R0_REGNUM);
4553 }
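
/* Editor's example -- illustrative only, not part of the original
   sources.  Given the rules above,

	struct hfa4 { float a, b, c, d; };
	struct hfa4 f (void);	// returned in s0-s3
	__int128 g (void);	// returned in x0/x1

   the HFA comes back as a PARALLEL of four SFmode registers starting at
   V0_REGNUM, while the 128-bit integer uses the first two general
   registers, matching aarch64_function_value_regno_p below.  */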
4554
4555 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4556 Return true if REGNO is the number of a hard register in which the values
4557 of called function may come back. */
4558
4559 static bool
4560 aarch64_function_value_regno_p (const unsigned int regno)
4561 {
4562 /* Maximum of 16 bytes can be returned in the general registers. Examples
4563 of 16-byte return values are: 128-bit integers and 16-byte small
4564 structures (excluding homogeneous floating-point aggregates). */
4565 if (regno == R0_REGNUM || regno == R1_REGNUM)
4566 return true;
4567
4568 /* Up to four fp/simd registers can return a function value, e.g. a
4569 homogeneous floating-point aggregate having four members. */
4570 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4571 return TARGET_FLOAT;
4572
4573 return false;
4574 }
4575
4576 /* Implement TARGET_RETURN_IN_MEMORY.
4577
4578 If the type T of the result of a function is such that
4579 void func (T arg)
4580 would require that arg be passed as a value in a register (or set of
4581 registers) according to the parameter passing rules, then the result
4582 is returned in the same registers as would be used for such an
4583 argument. */
4584
4585 static bool
4586 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4587 {
4588 HOST_WIDE_INT size;
4589 machine_mode ag_mode;
4590 int count;
4591
4592 if (!AGGREGATE_TYPE_P (type)
4593 && TREE_CODE (type) != COMPLEX_TYPE
4594 && TREE_CODE (type) != VECTOR_TYPE)
4595 /* Simple scalar types are always returned in registers. */
4596 return false;
4597
4598 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4599 type,
4600 &ag_mode,
4601 &count,
4602 NULL))
4603 return false;
4604
4605 /* Types larger than 2 registers are returned in memory. */
4606 size = int_size_in_bytes (type);
4607 return (size < 0 || size > 2 * UNITS_PER_WORD);
4608 }
4609
4610 static bool
4611 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4612 const_tree type, int *nregs)
4613 {
4614 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4615 return aarch64_vfp_is_call_or_return_candidate (mode,
4616 type,
4617 &pcum->aapcs_vfp_rmode,
4618 nregs,
4619 NULL);
4620 }
4621
4622 /* Given MODE and TYPE of a function argument, return the alignment in
4623 bits. The idea is to suppress any stronger alignment requested by
4624 the user and opt for the natural alignment (specified in AAPCS64 \S
4625 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4626 calculated in versions of GCC prior to GCC-9. This is a helper
4627 function for local use only. */
4628
4629 static unsigned int
4630 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4631 bool *abi_break)
4632 {
4633 *abi_break = false;
4634 if (!type)
4635 return GET_MODE_ALIGNMENT (mode);
4636
4637 if (integer_zerop (TYPE_SIZE (type)))
4638 return 0;
4639
4640 gcc_assert (TYPE_MODE (type) == mode);
4641
4642 if (!AGGREGATE_TYPE_P (type))
4643 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4644
4645 if (TREE_CODE (type) == ARRAY_TYPE)
4646 return TYPE_ALIGN (TREE_TYPE (type));
4647
4648 unsigned int alignment = 0;
4649 unsigned int bitfield_alignment = 0;
4650 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4651 if (TREE_CODE (field) == FIELD_DECL)
4652 {
4653 alignment = std::max (alignment, DECL_ALIGN (field));
4654 if (DECL_BIT_FIELD_TYPE (field))
4655 bitfield_alignment
4656 = std::max (bitfield_alignment,
4657 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4658 }
4659
4660 if (bitfield_alignment > alignment)
4661 {
4662 *abi_break = true;
4663 return bitfield_alignment;
4664 }
4665
4666 return alignment;
4667 }
4668
4669 /* Layout a function argument according to the AAPCS64 rules. The rule
4670 numbers refer to the rule numbers in the AAPCS64. */
4671
4672 static void
4673 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4674 const_tree type,
4675 bool named ATTRIBUTE_UNUSED)
4676 {
4677 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4678 int ncrn, nvrn, nregs;
4679 bool allocate_ncrn, allocate_nvrn;
4680 HOST_WIDE_INT size;
4681 bool abi_break;
4682
4683 /* We need to do this once per argument. */
4684 if (pcum->aapcs_arg_processed)
4685 return;
4686
4687 pcum->aapcs_arg_processed = true;
4688
4689 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4690 if (type)
4691 size = int_size_in_bytes (type);
4692 else
4693 /* No frontends can create types with variable-sized modes, so we
4694 shouldn't be asked to pass or return them. */
4695 size = GET_MODE_SIZE (mode).to_constant ();
4696 size = ROUND_UP (size, UNITS_PER_WORD);
4697
4698 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4699 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4700 mode,
4701 type,
4702 &nregs);
4703
4704 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4705 The following code thus handles passing by SIMD/FP registers first. */
4706
4707 nvrn = pcum->aapcs_nvrn;
4708
4709 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4710 and homogeneous short-vector aggregates (HVA). */
4711 if (allocate_nvrn)
4712 {
4713 if (!TARGET_FLOAT)
4714 aarch64_err_no_fpadvsimd (mode);
4715
4716 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4717 {
4718 pcum->aapcs_nextnvrn = nvrn + nregs;
4719 if (!aarch64_composite_type_p (type, mode))
4720 {
4721 gcc_assert (nregs == 1);
4722 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4723 }
4724 else
4725 {
4726 rtx par;
4727 int i;
4728 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4729 for (i = 0; i < nregs; i++)
4730 {
4731 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4732 V0_REGNUM + nvrn + i);
4733 rtx offset = gen_int_mode
4734 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4735 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4736 XVECEXP (par, 0, i) = tmp;
4737 }
4738 pcum->aapcs_reg = par;
4739 }
4740 return;
4741 }
4742 else
4743 {
4744 /* C.3 NSRN is set to 8. */
4745 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4746 goto on_stack;
4747 }
4748 }
4749
4750 ncrn = pcum->aapcs_ncrn;
4751 nregs = size / UNITS_PER_WORD;
4752
4753 /* C6 - C9, though the sign and zero extension semantics are
4754 handled elsewhere. This is the case where the argument fits
4755 entirely in general registers. */
4756 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4757 {
4758 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4759
4760 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
4761 rounded up to the next even number. */
4762 if (nregs == 2
4763 && ncrn % 2
4764 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4765 comparison is there because for > 16 * BITS_PER_UNIT
4766 alignment nregs should be > 2 and therefore it should be
4767 passed by reference rather than value. */
4768 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4769 == 16 * BITS_PER_UNIT))
4770 {
4771 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4772 inform (input_location, "parameter passing for argument of type "
4773 "%qT changed in GCC 9.1", type);
4774 ++ncrn;
4775 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4776 }
4777
4778 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4779 A reg is still generated for it, but the caller should be smart
4780 enough not to use it. */
4781 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4782 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4783 else
4784 {
4785 rtx par;
4786 int i;
4787
4788 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4789 for (i = 0; i < nregs; i++)
4790 {
4791 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4792 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4793 GEN_INT (i * UNITS_PER_WORD));
4794 XVECEXP (par, 0, i) = tmp;
4795 }
4796 pcum->aapcs_reg = par;
4797 }
4798
4799 pcum->aapcs_nextncrn = ncrn + nregs;
4800 return;
4801 }
4802
4803 /* C.11 */
4804 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4805
4806 /* The argument is passed on stack; record the needed number of words for
4807 this argument and align the total size if necessary. */
4808 on_stack:
4809 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4810
4811 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4812 == 16 * BITS_PER_UNIT)
4813 {
4814 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4815 if (pcum->aapcs_stack_size != new_size)
4816 {
4817 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4818 inform (input_location, "parameter passing for argument of type "
4819 "%qT changed in GCC 9.1", type);
4820 pcum->aapcs_stack_size = new_size;
4821 }
4822 }
4823 return;
4824 }
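
/* Editor's example -- illustrative only, not part of the original
   sources.  Rule C.8 above in practice: for

	void f (int x, __int128 y);

   'y' requires 16-byte alignment, so if 'x' occupies w0 then 'y' is
   passed in the even-numbered pair x2/x3 rather than x1/x2; the NGRN
   is rounded up before the two registers are allocated.  */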
4825
4826 /* Implement TARGET_FUNCTION_ARG. */
4827
4828 static rtx
4829 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4830 {
4831 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4832 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4833
4834 if (arg.end_marker_p ())
4835 return NULL_RTX;
4836
4837 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4838 return pcum->aapcs_reg;
4839 }
4840
4841 void
4842 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4843 const_tree fntype ATTRIBUTE_UNUSED,
4844 rtx libname ATTRIBUTE_UNUSED,
4845 const_tree fndecl ATTRIBUTE_UNUSED,
4846 unsigned n_named ATTRIBUTE_UNUSED)
4847 {
4848 pcum->aapcs_ncrn = 0;
4849 pcum->aapcs_nvrn = 0;
4850 pcum->aapcs_nextncrn = 0;
4851 pcum->aapcs_nextnvrn = 0;
4852 pcum->pcs_variant = ARM_PCS_AAPCS64;
4853 pcum->aapcs_reg = NULL_RTX;
4854 pcum->aapcs_arg_processed = false;
4855 pcum->aapcs_stack_words = 0;
4856 pcum->aapcs_stack_size = 0;
4857
4858 if (!TARGET_FLOAT
4859 && fndecl && TREE_PUBLIC (fndecl)
4860 && fntype && fntype != error_mark_node)
4861 {
4862 const_tree type = TREE_TYPE (fntype);
4863 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4864 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4865 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4866 &mode, &nregs, NULL))
4867 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4868 }
4869 return;
4870 }
4871
4872 static void
4873 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4874 const function_arg_info &arg)
4875 {
4876 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4877 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4878 {
4879 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4880 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4881 != (pcum->aapcs_stack_words != 0));
4882 pcum->aapcs_arg_processed = false;
4883 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4884 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4885 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4886 pcum->aapcs_stack_words = 0;
4887 pcum->aapcs_reg = NULL_RTX;
4888 }
4889 }
4890
4891 bool
4892 aarch64_function_arg_regno_p (unsigned regno)
4893 {
4894 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4895 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4896 }
4897
4898 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4899 PARM_BOUNDARY bits of alignment, but will be given anything up
4900 to STACK_BOUNDARY bits if the type requires it. This makes sure
4901 that both before and after the layout of each argument, the Next
4902 Stacked Argument Address (NSAA) will have a minimum alignment of
4903 8 bytes. */
4904
4905 static unsigned int
4906 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4907 {
4908 bool abi_break;
4909 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4910 &abi_break);
4911 if (abi_break && warn_psabi)
4912 inform (input_location, "parameter passing for argument of type "
4913 "%qT changed in GCC 9.1", type);
4914
4915 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4916 }
4917
4918 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4919
4920 static fixed_size_mode
4921 aarch64_get_reg_raw_mode (int regno)
4922 {
4923 if (TARGET_SVE && FP_REGNUM_P (regno))
4924 /* Don't use the SVE part of the register for __builtin_apply and
4925 __builtin_return. The SVE registers aren't used by the normal PCS,
4926 so using them there would be a waste of time. The PCS extensions
4927 for SVE types are fundamentally incompatible with the
4928 __builtin_return/__builtin_apply interface. */
4929 return as_a <fixed_size_mode> (V16QImode);
4930 return default_get_reg_raw_mode (regno);
4931 }
4932
4933 /* Implement TARGET_FUNCTION_ARG_PADDING.
4934
4935 Small aggregate types are placed in the lowest memory address.
4936
4937 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4938
4939 static pad_direction
4940 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4941 {
4942 /* On little-endian targets, the least significant byte of every stack
4943 argument is passed at the lowest byte address of the stack slot. */
4944 if (!BYTES_BIG_ENDIAN)
4945 return PAD_UPWARD;
4946
4947 /* Otherwise, integral, floating-point and pointer types are padded downward:
4948 the least significant byte of a stack argument is passed at the highest
4949 byte address of the stack slot. */
4950 if (type
4951 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4952 || POINTER_TYPE_P (type))
4953 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4954 return PAD_DOWNWARD;
4955
4956 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4957 return PAD_UPWARD;
4958 }
4959
4960 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4961
4962 It specifies padding for the last (and possibly only)
4963 element of a block move between registers and memory. Assuming
4964 the block is in memory, padding upward means that the last
4965 element is padded after its most significant byte, while in
4966 downward padding the last element is padded on its least
4967 significant byte side.
4968
4969 Small aggregates and small complex types are always padded
4970 upwards.
4971
4972 We don't need to worry about homogeneous floating-point or
4973 short-vector aggregates; their move is not affected by the
4974 padding direction determined here. Regardless of endianness,
4975 each element of such an aggregate is put in the least
4976 significant bits of a fp/simd register.
4977
4978 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4979 register has useful data, and return the opposite if the most
4980 significant byte does. */
4981
4982 bool
4983 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4984 bool first ATTRIBUTE_UNUSED)
4985 {
4986
4987 /* Small composite types are always padded upward. */
4988 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4989 {
4990 HOST_WIDE_INT size;
4991 if (type)
4992 size = int_size_in_bytes (type);
4993 else
4994 /* No frontends can create types with variable-sized modes, so we
4995 shouldn't be asked to pass or return them. */
4996 size = GET_MODE_SIZE (mode).to_constant ();
4997 if (size < 2 * UNITS_PER_WORD)
4998 return true;
4999 }
5000
5001 /* Otherwise, use the default padding. */
5002 return !BYTES_BIG_ENDIAN;
5003 }
5004
5005 static scalar_int_mode
5006 aarch64_libgcc_cmp_return_mode (void)
5007 {
5008 return SImode;
5009 }
5010
5011 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5012
5013 /* We use the 12-bit shifted immediate arithmetic instructions so values
5014 must be a multiple of (1 << 12), i.e. 4096. */
5015 #define ARITH_FACTOR 4096
5016
5017 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5018 #error Cannot use simple address calculation for stack probing
5019 #endif
5020
5021 /* The pair of scratch registers used for stack probing. */
5022 #define PROBE_STACK_FIRST_REG R9_REGNUM
5023 #define PROBE_STACK_SECOND_REG R10_REGNUM
5024
5025 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5026 inclusive. These are offsets from the current stack pointer. */
5027
5028 static void
5029 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5030 {
5031 HOST_WIDE_INT size;
5032 if (!poly_size.is_constant (&size))
5033 {
5034 sorry ("stack probes for SVE frames");
5035 return;
5036 }
5037
5038 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5039
5040 /* See the same assertion on PROBE_INTERVAL above. */
5041 gcc_assert ((first % ARITH_FACTOR) == 0);
5042
5043 /* See if we have a constant small number of probes to generate. If so,
5044 that's the easy case. */
5045 if (size <= PROBE_INTERVAL)
5046 {
5047 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5048
5049 emit_set_insn (reg1,
5050 plus_constant (Pmode,
5051 stack_pointer_rtx, -(first + base)));
5052 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5053 }
5054
5055 /* The run-time loop is made up of 8 insns in the generic case while the
5056 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5057 else if (size <= 4 * PROBE_INTERVAL)
5058 {
5059 HOST_WIDE_INT i, rem;
5060
5061 emit_set_insn (reg1,
5062 plus_constant (Pmode,
5063 stack_pointer_rtx,
5064 -(first + PROBE_INTERVAL)));
5065 emit_stack_probe (reg1);
5066
5067 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5068 it exceeds SIZE. If only two probes are needed, this will not
5069 generate any code. Then probe at FIRST + SIZE. */
5070 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5071 {
5072 emit_set_insn (reg1,
5073 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5074 emit_stack_probe (reg1);
5075 }
5076
5077 rem = size - (i - PROBE_INTERVAL);
5078 if (rem > 256)
5079 {
5080 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5081
5082 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5083 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5084 }
5085 else
5086 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5087 }
5088
5089 /* Otherwise, do the same as above, but in a loop. Note that we must be
5090 extra careful with variables wrapping around because we might be at
5091 the very top (or the very bottom) of the address space and we have
5092 to be able to handle this case properly; in particular, we use an
5093 equality test for the loop condition. */
5094 else
5095 {
5096 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5097
5098 /* Step 1: round SIZE to the previous multiple of the interval. */
5099
5100 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5101
5102
5103 /* Step 2: compute initial and final value of the loop counter. */
5104
5105 /* TEST_ADDR = SP + FIRST. */
5106 emit_set_insn (reg1,
5107 plus_constant (Pmode, stack_pointer_rtx, -first));
5108
5109 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5110 HOST_WIDE_INT adjustment = - (first + rounded_size);
5111 if (! aarch64_uimm12_shift (adjustment))
5112 {
5113 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5114 true, Pmode);
5115 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5116 }
5117 else
5118 emit_set_insn (reg2,
5119 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5120
5121 /* Step 3: the loop
5122
5123 do
5124 {
5125 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5126 probe at TEST_ADDR
5127 }
5128 while (TEST_ADDR != LAST_ADDR)
5129
5130 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5131 until it is equal to ROUNDED_SIZE. */
5132
5133 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5134
5135
5136 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5137 that SIZE is equal to ROUNDED_SIZE. */
5138
5139 if (size != rounded_size)
5140 {
5141 HOST_WIDE_INT rem = size - rounded_size;
5142
5143 if (rem > 256)
5144 {
5145 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5146
5147 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5148 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5149 }
5150 else
5151 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5152 }
5153 }
5154
5155 /* Make sure nothing is scheduled before we are done. */
5156 emit_insn (gen_blockage ());
5157 }
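
/* Editor's note -- a worked example, not part of the original sources:
   with PROBE_INTERVAL == 4096, FIRST == 0 and SIZE == 10000 the function
   above takes the unrolled branch: it probes at offsets 4096 and 8192
   below the stack pointer, and because the remainder of 1808 bytes is
   larger than 256 it then steps the probe register down by a further
   4096 and probes 2288 bytes above it, i.e. exactly at offset 10000.  */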
5158
5159 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5160 absolute addresses. */
5161
5162 const char *
5163 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5164 {
5165 static int labelno = 0;
5166 char loop_lab[32];
5167 rtx xops[2];
5168
5169 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5170
5171 /* Loop. */
5172 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5173
5174 HOST_WIDE_INT stack_clash_probe_interval
5175 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5176
5177 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5178 xops[0] = reg1;
5179 HOST_WIDE_INT interval;
5180 if (flag_stack_clash_protection)
5181 interval = stack_clash_probe_interval;
5182 else
5183 interval = PROBE_INTERVAL;
5184
5185 gcc_assert (aarch64_uimm12_shift (interval));
5186 xops[1] = GEN_INT (interval);
5187
5188 output_asm_insn ("sub\t%0, %0, %1", xops);
5189
5190 /* If doing stack clash protection then we probe up by the ABI specified
5191 amount. We do this because we're dropping full pages at a time in the
5192 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5193 if (flag_stack_clash_protection)
5194 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5195 else
5196 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5197
5198 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5199 by this amount for each iteration. */
5200 output_asm_insn ("str\txzr, [%0, %1]", xops);
5201
5202 /* Test if TEST_ADDR == LAST_ADDR. */
5203 xops[1] = reg2;
5204 output_asm_insn ("cmp\t%0, %1", xops);
5205
5206 /* Branch. */
5207 fputs ("\tb.ne\t", asm_out_file);
5208 assemble_name_raw (asm_out_file, loop_lab);
5209 fputc ('\n', asm_out_file);
5210
5211 return "";
5212 }
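
/* Editor's sketch -- illustrative only, not part of the original sources:
   with the default 4 KiB probe interval and no stack clash protection,
   the loop emitted above looks roughly like

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9, 0]
		cmp	x9, x10
		b.ne	.LPSRL0

   where x9 and x10 are PROBE_STACK_FIRST_REG and PROBE_STACK_SECOND_REG
   as set up by aarch64_emit_probe_stack_range above.  */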
5213
5214 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5215 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5216 of GUARD_SIZE. When a probe is emitted it is done at most
5217 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5218 at most MIN_PROBE_THRESHOLD. By the end of this function
5219 BASE = BASE - ADJUSTMENT. */
5220
5221 const char *
5222 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5223 rtx min_probe_threshold, rtx guard_size)
5224 {
5225 /* This function is not allowed to use any instruction generation function
5226 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5227 so instead emit the code you want using output_asm_insn. */
5228 gcc_assert (flag_stack_clash_protection);
5229 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5230 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5231
5232 /* The minimum required allocation before the residual requires probing. */
5233 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5234
5235 /* Clamp the value down to the nearest value that can be used with a cmp. */
5236 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5237 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5238
5239 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5240 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5241
5242 static int labelno = 0;
5243 char loop_start_lab[32];
5244 char loop_end_lab[32];
5245 rtx xops[2];
5246
5247 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5248 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5249
5250 /* Emit loop start label. */
5251 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5252
5253 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5254 xops[0] = adjustment;
5255 xops[1] = probe_offset_value_rtx;
5256 output_asm_insn ("cmp\t%0, %1", xops);
5257
5258 /* Branch to end if not enough adjustment to probe. */
5259 fputs ("\tb.lt\t", asm_out_file);
5260 assemble_name_raw (asm_out_file, loop_end_lab);
5261 fputc ('\n', asm_out_file);
5262
5263 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5264 xops[0] = base;
5265 xops[1] = probe_offset_value_rtx;
5266 output_asm_insn ("sub\t%0, %0, %1", xops);
5267
5268 /* Probe at BASE. */
5269 xops[1] = const0_rtx;
5270 output_asm_insn ("str\txzr, [%0, %1]", xops);
5271
5272 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5273 xops[0] = adjustment;
5274 xops[1] = probe_offset_value_rtx;
5275 output_asm_insn ("sub\t%0, %0, %1", xops);
5276
5277 /* Branch to start if still more bytes to allocate. */
5278 fputs ("\tb\t", asm_out_file);
5279 assemble_name_raw (asm_out_file, loop_start_lab);
5280 fputc ('\n', asm_out_file);
5281
5282 /* No probe needed; exit the loop. */
5283 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5284
5285 /* BASE = BASE - ADJUSTMENT. */
5286 xops[0] = base;
5287 xops[1] = adjustment;
5288 output_asm_insn ("sub\t%0, %0, %1", xops);
5289 return "";
5290 }
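
/* For illustration only: the rough shape of the code emitted above, with
   x10 standing in for BASE, x11 for ADJUSTMENT and GUARD for the clamped
   RESIDUAL_PROBE_GUARD value (the actual registers are whatever the caller
   passed in):

	.SVLPSPL0:
	cmp	x11, GUARD
	b.lt	.SVLPEND0
	sub	x10, x10, GUARD
	str	xzr, [x10, 0]
	sub	x11, x11, GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11

   i.e. full GUARD-sized steps are probed in the loop and the sub-GUARD
   remainder is applied unprobed at the end.  */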
5291
5292 /* Determine whether a frame chain needs to be generated. */
5293 static bool
5294 aarch64_needs_frame_chain (void)
5295 {
5296 /* Force a frame chain for EH returns so the return address is at FP+8. */
5297 if (frame_pointer_needed || crtl->calls_eh_return)
5298 return true;
5299
5300 /* A leaf function cannot have calls or write LR. */
5301 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5302
5303 /* Don't use a frame chain in leaf functions if leaf frame pointers
5304 are disabled. */
5305 if (flag_omit_leaf_frame_pointer && is_leaf)
5306 return false;
5307
5308 return aarch64_use_frame_pointer;
5309 }
5310
5311 /* Mark the registers that need to be saved by the callee and calculate
5312 the size of the callee-saved registers area and frame record (both FP
5313 and LR may be omitted). */
5314 static void
5315 aarch64_layout_frame (void)
5316 {
5317 HOST_WIDE_INT offset = 0;
5318 int regno, last_fp_reg = INVALID_REGNUM;
5319 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5320
5321 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5322
5323 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5324 the mid-end is doing. */
5325 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5326
5327 #define SLOT_NOT_REQUIRED (-2)
5328 #define SLOT_REQUIRED (-1)
5329
5330 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5331 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5332
5333 /* If this is a non-leaf simd function with calls we assume that
5334 at least one of those calls is to a non-simd function and thus
5335 we must save V8 to V23 in the prologue. */
5336
5337 if (simd_function && !crtl->is_leaf)
5338 {
5339 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5340 if (FP_SIMD_SAVED_REGNUM_P (regno))
5341 df_set_regs_ever_live (regno, true);
5342 }
5343
5344 /* First mark all the registers that really need to be saved... */
5345 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5346 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5347
5348 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5349 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5350
5351 /* ... that includes the eh data registers (if needed)... */
5352 if (crtl->calls_eh_return)
5353 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5354 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5355 = SLOT_REQUIRED;
5356
5357 /* ... and any callee saved register that dataflow says is live. */
5358 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5359 if (df_regs_ever_live_p (regno)
5360 && (regno == R30_REGNUM
5361 || !call_used_or_fixed_reg_p (regno)))
5362 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5363
5364 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5365 if (df_regs_ever_live_p (regno)
5366 && (!call_used_or_fixed_reg_p (regno)
5367 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5368 {
5369 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5370 last_fp_reg = regno;
5371 }
5372
5373 if (cfun->machine->frame.emit_frame_chain)
5374 {
5375 /* FP and LR are placed in the linkage record. */
5376 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5377 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5378 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5379 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5380 offset = 2 * UNITS_PER_WORD;
5381 }
5382
5383 /* With stack-clash, LR must be saved in non-leaf functions. */
5384 gcc_assert (crtl->is_leaf
5385 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5386 != SLOT_NOT_REQUIRED));
5387
5388 /* Now assign stack slots for them. */
5389 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5390 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5391 {
5392 cfun->machine->frame.reg_offset[regno] = offset;
5393 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5394 cfun->machine->frame.wb_candidate1 = regno;
5395 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5396 cfun->machine->frame.wb_candidate2 = regno;
5397 offset += UNITS_PER_WORD;
5398 }
5399
5400 HOST_WIDE_INT max_int_offset = offset;
5401 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5402 bool has_align_gap = offset != max_int_offset;
5403
5404 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5405 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5406 {
5407 /* If there is an alignment gap between integer and fp callee-saves,
5408 allocate the last fp register to it if possible. */
5409 if (regno == last_fp_reg
5410 && has_align_gap
5411 && !simd_function
5412 && (offset & 8) == 0)
5413 {
5414 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5415 break;
5416 }
5417
5418 cfun->machine->frame.reg_offset[regno] = offset;
5419 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5420 cfun->machine->frame.wb_candidate1 = regno;
5421 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5422 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5423 cfun->machine->frame.wb_candidate2 = regno;
5424 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5425 }
5426
5427 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5428
5429 cfun->machine->frame.saved_regs_size = offset;
5430
5431 HOST_WIDE_INT varargs_and_saved_regs_size
5432 = offset + cfun->machine->frame.saved_varargs_size;
5433
5434 cfun->machine->frame.hard_fp_offset
5435 = aligned_upper_bound (varargs_and_saved_regs_size
5436 + get_frame_size (),
5437 STACK_BOUNDARY / BITS_PER_UNIT);
5438
5439 /* Both these values are already aligned. */
5440 gcc_assert (multiple_p (crtl->outgoing_args_size,
5441 STACK_BOUNDARY / BITS_PER_UNIT));
5442 cfun->machine->frame.frame_size
5443 = (cfun->machine->frame.hard_fp_offset
5444 + crtl->outgoing_args_size);
5445
5446 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5447
5448 cfun->machine->frame.initial_adjust = 0;
5449 cfun->machine->frame.final_adjust = 0;
5450 cfun->machine->frame.callee_adjust = 0;
5451 cfun->machine->frame.callee_offset = 0;
5452
5453 HOST_WIDE_INT max_push_offset = 0;
5454 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5455 max_push_offset = 512;
5456 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5457 max_push_offset = 256;
5458
5459 HOST_WIDE_INT const_size, const_fp_offset;
5460 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5461 && const_size < max_push_offset
5462 && known_eq (crtl->outgoing_args_size, 0))
5463 {
5464 /* Simple, small frame with no outgoing arguments:
5465 stp reg1, reg2, [sp, -frame_size]!
5466 stp reg3, reg4, [sp, 16] */
5467 cfun->machine->frame.callee_adjust = const_size;
5468 }
5469 else if (known_lt (crtl->outgoing_args_size
5470 + cfun->machine->frame.saved_regs_size, 512)
5471 && !(cfun->calls_alloca
5472 && known_lt (cfun->machine->frame.hard_fp_offset,
5473 max_push_offset)))
5474 {
5475 /* Frame with small outgoing arguments:
5476 sub sp, sp, frame_size
5477 stp reg1, reg2, [sp, outgoing_args_size]
5478 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5479 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5480 cfun->machine->frame.callee_offset
5481 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5482 }
5483 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5484 && const_fp_offset < max_push_offset)
5485 {
5486 /* Frame with large outgoing arguments but a small local area:
5487 stp reg1, reg2, [sp, -hard_fp_offset]!
5488 stp reg3, reg4, [sp, 16]
5489 sub sp, sp, outgoing_args_size */
5490 cfun->machine->frame.callee_adjust = const_fp_offset;
5491 cfun->machine->frame.final_adjust
5492 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5493 }
5494 else
5495 {
5496 /* Frame with large local area and outgoing arguments using frame pointer:
5497 sub sp, sp, hard_fp_offset
5498 stp x29, x30, [sp, 0]
5499 add x29, sp, 0
5500 stp reg3, reg4, [sp, 16]
5501 sub sp, sp, outgoing_args_size */
5502 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5503 cfun->machine->frame.final_adjust
5504 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5505 }
5506
5507 cfun->machine->frame.laid_out = true;
5508 }
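
/* A worked example of the layout above (illustrative only): a function that
   needs a frame chain, saves x19 and x20, has 16 bytes of locals and no
   outgoing arguments gets reg_offset[x29] = 0, reg_offset[x30] = 8,
   reg_offset[x19] = 16, reg_offset[x20] = 24, saved_regs_size = 32,
   hard_fp_offset = 48 and frame_size = 48.  Since 48 < max_push_offset and
   there are no outgoing arguments, callee_adjust = 48 and the prologue is
   expected to look roughly like:

	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]

   with the 16 bytes of locals living at [sp, 32].  */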
5509
5510 /* Return true if the register REGNO is saved on entry to
5511 the current function. */
5512
5513 static bool
5514 aarch64_register_saved_on_entry (int regno)
5515 {
5516 return cfun->machine->frame.reg_offset[regno] >= 0;
5517 }
5518
5519 /* Return the next register, from REGNO up to LIMIT, that the callee
5520 needs to save. */
5521
5522 static unsigned
5523 aarch64_next_callee_save (unsigned regno, unsigned limit)
5524 {
5525 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5526 regno ++;
5527 return regno;
5528 }
5529
5530 /* Push the register number REGNO of mode MODE to the stack with write-back
5531 adjusting the stack by ADJUSTMENT. */
5532
5533 static void
5534 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5535 HOST_WIDE_INT adjustment)
5536 {
5537 rtx base_rtx = stack_pointer_rtx;
5538 rtx insn, reg, mem;
5539
5540 reg = gen_rtx_REG (mode, regno);
5541 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5542 plus_constant (Pmode, base_rtx, -adjustment));
5543 mem = gen_frame_mem (mode, mem);
5544
5545 insn = emit_move_insn (mem, reg);
5546 RTX_FRAME_RELATED_P (insn) = 1;
5547 }
5548
5549 /* Generate and return an instruction to store the pair of registers
5550 REG and REG2 of mode MODE to location BASE with write-back adjusting
5551 the stack location BASE by ADJUSTMENT. */
5552
5553 static rtx
5554 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5555 HOST_WIDE_INT adjustment)
5556 {
5557 switch (mode)
5558 {
5559 case E_DImode:
5560 return gen_storewb_pairdi_di (base, base, reg, reg2,
5561 GEN_INT (-adjustment),
5562 GEN_INT (UNITS_PER_WORD - adjustment));
5563 case E_DFmode:
5564 return gen_storewb_pairdf_di (base, base, reg, reg2,
5565 GEN_INT (-adjustment),
5566 GEN_INT (UNITS_PER_WORD - adjustment));
5567 case E_TFmode:
5568 return gen_storewb_pairtf_di (base, base, reg, reg2,
5569 GEN_INT (-adjustment),
5570 GEN_INT (UNITS_PER_VREG - adjustment));
5571 default:
5572 gcc_unreachable ();
5573 }
5574 }
5575
5576 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5577 stack pointer by ADJUSTMENT. */
5578
5579 static void
5580 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5581 {
5582 rtx_insn *insn;
5583 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5584
5585 if (regno2 == INVALID_REGNUM)
5586 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5587
5588 rtx reg1 = gen_rtx_REG (mode, regno1);
5589 rtx reg2 = gen_rtx_REG (mode, regno2);
5590
5591 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5592 reg2, adjustment));
5593 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5594 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5595 RTX_FRAME_RELATED_P (insn) = 1;
5596 }
5597
5598 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
5599 adjusting it by ADJUSTMENT afterwards. */
5600
5601 static rtx
5602 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5603 HOST_WIDE_INT adjustment)
5604 {
5605 switch (mode)
5606 {
5607 case E_DImode:
5608 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5609 GEN_INT (UNITS_PER_WORD));
5610 case E_DFmode:
5611 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5612 GEN_INT (UNITS_PER_WORD));
5613 case E_TFmode:
5614 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5615 GEN_INT (UNITS_PER_VREG));
5616 default:
5617 gcc_unreachable ();
5618 }
5619 }
5620
5621 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5622 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5623 into CFI_OPS. */
5624
5625 static void
5626 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5627 rtx *cfi_ops)
5628 {
5629 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5630 rtx reg1 = gen_rtx_REG (mode, regno1);
5631
5632 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5633
5634 if (regno2 == INVALID_REGNUM)
5635 {
5636 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5637 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5638 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5639 }
5640 else
5641 {
5642 rtx reg2 = gen_rtx_REG (mode, regno2);
5643 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5644 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5645 reg2, adjustment));
5646 }
5647 }
5648
5649 /* Generate and return a store pair instruction of mode MODE to store
5650 register REG1 to MEM1 and register REG2 to MEM2. */
5651
5652 static rtx
5653 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5654 rtx reg2)
5655 {
5656 switch (mode)
5657 {
5658 case E_DImode:
5659 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5660
5661 case E_DFmode:
5662 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5663
5664 case E_TFmode:
5665 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5666
5667 default:
5668 gcc_unreachable ();
5669 }
5670 }
5671
5672 /* Generate and return a load pair instruction of mode MODE to load register
5673 REG1 from MEM1 and register REG2 from MEM2. */
5674
5675 static rtx
5676 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5677 rtx mem2)
5678 {
5679 switch (mode)
5680 {
5681 case E_DImode:
5682 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5683
5684 case E_DFmode:
5685 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5686
5687 case E_TFmode:
5688 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5689
5690 default:
5691 gcc_unreachable ();
5692 }
5693 }
5694
5695 /* Return TRUE if return address signing should be enabled for the current
5696 function, otherwise return FALSE. */
5697
5698 bool
5699 aarch64_return_address_signing_enabled (void)
5700 {
5701 /* This function should only be called after the frame has been laid out. */
5702 gcc_assert (cfun->machine->frame.laid_out);
5703
5704 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5705 if its LR is pushed onto stack. */
5706 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5707 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5708 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5709 }
5710
5711 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5712 bool
5713 aarch64_bti_enabled (void)
5714 {
5715 return (aarch64_enable_bti == 1);
5716 }
5717
5718 /* Emit code to save the callee-saved registers from register number START
5719 to LIMIT to the stack at the location starting at offset START_OFFSET,
5720 skipping any write-back candidates if SKIP_WB is true. */
5721
5722 static void
5723 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5724 unsigned start, unsigned limit, bool skip_wb)
5725 {
5726 rtx_insn *insn;
5727 unsigned regno;
5728 unsigned regno2;
5729
5730 for (regno = aarch64_next_callee_save (start, limit);
5731 regno <= limit;
5732 regno = aarch64_next_callee_save (regno + 1, limit))
5733 {
5734 rtx reg, mem;
5735 poly_int64 offset;
5736 int offset_diff;
5737
5738 if (skip_wb
5739 && (regno == cfun->machine->frame.wb_candidate1
5740 || regno == cfun->machine->frame.wb_candidate2))
5741 continue;
5742
5743 if (cfun->machine->reg_is_wrapped_separately[regno])
5744 continue;
5745
5746 reg = gen_rtx_REG (mode, regno);
5747 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5748 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5749 offset));
5750
5751 regno2 = aarch64_next_callee_save (regno + 1, limit);
5752 offset_diff = cfun->machine->frame.reg_offset[regno2]
5753 - cfun->machine->frame.reg_offset[regno];
5754
5755 if (regno2 <= limit
5756 && !cfun->machine->reg_is_wrapped_separately[regno2]
5757 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5758 {
5759 rtx reg2 = gen_rtx_REG (mode, regno2);
5760 rtx mem2;
5761
5762 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5763 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5764 offset));
5765 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5766 reg2));
5767
5768 /* The first part of a frame-related parallel insn is
5769 always assumed to be relevant to the frame
5770 calculations; subsequent parts are only
5771 frame-related if explicitly marked. */
5772 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5773 regno = regno2;
5774 }
5775 else
5776 insn = emit_move_insn (mem, reg);
5777
5778 RTX_FRAME_RELATED_P (insn) = 1;
5779 }
5780 }
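
/* For example (illustrative): if x19 and x20 both need saving and occupy
   adjacent 8-byte slots, the loop above emits a single

	stp	x19, x20, [sp, <offset>]

   instead of two separate str instructions.  RTX_FRAME_RELATED_P is set on
   the insn and explicitly on the second element of the parallel, so the
   unwinder records both saves.  */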
5781
5782 /* Emit code to restore the callee registers of mode MODE from register
5783 number START up to and including LIMIT. Restore from the stack offset
5784 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5785 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5786
5787 static void
5788 aarch64_restore_callee_saves (machine_mode mode,
5789 poly_int64 start_offset, unsigned start,
5790 unsigned limit, bool skip_wb, rtx *cfi_ops)
5791 {
5792 rtx base_rtx = stack_pointer_rtx;
5793 unsigned regno;
5794 unsigned regno2;
5795 poly_int64 offset;
5796
5797 for (regno = aarch64_next_callee_save (start, limit);
5798 regno <= limit;
5799 regno = aarch64_next_callee_save (regno + 1, limit))
5800 {
5801 if (cfun->machine->reg_is_wrapped_separately[regno])
5802 continue;
5803
5804 rtx reg, mem;
5805 int offset_diff;
5806
5807 if (skip_wb
5808 && (regno == cfun->machine->frame.wb_candidate1
5809 || regno == cfun->machine->frame.wb_candidate2))
5810 continue;
5811
5812 reg = gen_rtx_REG (mode, regno);
5813 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5814 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5815
5816 regno2 = aarch64_next_callee_save (regno + 1, limit);
5817 offset_diff = cfun->machine->frame.reg_offset[regno2]
5818 - cfun->machine->frame.reg_offset[regno];
5819
5820 if (regno2 <= limit
5821 && !cfun->machine->reg_is_wrapped_separately[regno2]
5822 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5823 {
5824 rtx reg2 = gen_rtx_REG (mode, regno2);
5825 rtx mem2;
5826
5827 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5828 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5829 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5830
5831 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5832 regno = regno2;
5833 }
5834 else
5835 emit_move_insn (reg, mem);
5836 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5837 }
5838 }
5839
5840 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5841 of MODE. */
5842
5843 static inline bool
5844 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5845 {
5846 HOST_WIDE_INT multiple;
5847 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5848 && IN_RANGE (multiple, -8, 7));
5849 }
5850
5851 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5852 of MODE. */
5853
5854 static inline bool
5855 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5856 {
5857 HOST_WIDE_INT multiple;
5858 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5859 && IN_RANGE (multiple, 0, 63));
5860 }
5861
5862 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5863 of MODE. */
5864
5865 bool
5866 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5867 {
5868 HOST_WIDE_INT multiple;
5869 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5870 && IN_RANGE (multiple, -64, 63));
5871 }
5872
5873 /* Return true if OFFSET is a signed 9-bit value. */
5874
5875 bool
5876 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5877 poly_int64 offset)
5878 {
5879 HOST_WIDE_INT const_offset;
5880 return (offset.is_constant (&const_offset)
5881 && IN_RANGE (const_offset, -256, 255));
5882 }
5883
5884 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5885 of MODE. */
5886
5887 static inline bool
5888 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5889 {
5890 HOST_WIDE_INT multiple;
5891 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5892 && IN_RANGE (multiple, -256, 255));
5893 }
5894
5895 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5896 of MODE. */
5897
5898 static inline bool
5899 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5900 {
5901 HOST_WIDE_INT multiple;
5902 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5903 && IN_RANGE (multiple, 0, 4095));
5904 }
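
/* Concrete byte ranges for the predicates above when MODE is DImode
   (8-byte units), for illustration:

     offset_4bit_signed_scaled_p:		[-64, 56]
     offset_6bit_unsigned_scaled_p:		[0, 504]
     aarch64_offset_7bit_signed_scaled_p:	[-512, 504]
     aarch64_offset_9bit_signed_unscaled_p:	[-256, 255]
     offset_9bit_signed_scaled_p:		[-2048, 2040]
     offset_12bit_unsigned_scaled_p:		[0, 32760]

   For the scaled predicates the offset must also be a multiple of 8.  */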
5905
5906 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5907
5908 static sbitmap
5909 aarch64_get_separate_components (void)
5910 {
5911 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5912 bitmap_clear (components);
5913
5914 /* The registers we need saved to the frame. */
5915 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5916 if (aarch64_register_saved_on_entry (regno))
5917 {
5918 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5919 if (!frame_pointer_needed)
5920 offset += cfun->machine->frame.frame_size
5921 - cfun->machine->frame.hard_fp_offset;
5922 /* Check that we can access the stack slot of the register with one
5923 direct load with no adjustments needed. */
5924 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5925 bitmap_set_bit (components, regno);
5926 }
5927
5928 /* Don't mess with the hard frame pointer. */
5929 if (frame_pointer_needed)
5930 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5931
5932 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5933 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5934 /* If registers have been chosen to be stored/restored with
5935 writeback don't interfere with them to avoid having to output explicit
5936 stack adjustment instructions. */
5937 if (reg2 != INVALID_REGNUM)
5938 bitmap_clear_bit (components, reg2);
5939 if (reg1 != INVALID_REGNUM)
5940 bitmap_clear_bit (components, reg1);
5941
5942 bitmap_clear_bit (components, LR_REGNUM);
5943 bitmap_clear_bit (components, SP_REGNUM);
5944
5945 return components;
5946 }
5947
5948 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5949
5950 static sbitmap
5951 aarch64_components_for_bb (basic_block bb)
5952 {
5953 bitmap in = DF_LIVE_IN (bb);
5954 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5955 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5956 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5957
5958 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5959 bitmap_clear (components);
5960
5961 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5962 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5963 if ((!call_used_or_fixed_reg_p (regno)
5964 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5965 && (bitmap_bit_p (in, regno)
5966 || bitmap_bit_p (gen, regno)
5967 || bitmap_bit_p (kill, regno)))
5968 {
5969 unsigned regno2, offset, offset2;
5970 bitmap_set_bit (components, regno);
5971
5972 /* If there is a callee-save at an adjacent offset, add it too
5973 to increase the use of LDP/STP. */
5974 offset = cfun->machine->frame.reg_offset[regno];
5975 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5976
5977 if (regno2 <= LAST_SAVED_REGNUM)
5978 {
5979 offset2 = cfun->machine->frame.reg_offset[regno2];
5980 if ((offset & ~8) == (offset2 & ~8))
5981 bitmap_set_bit (components, regno2);
5982 }
5983 }
5984
5985 return components;
5986 }
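
/* For example (illustrative): if x20 is live in the block and its slot is at
   reg_offset 32 (bit 3 clear), the candidate partner is x21; if x21's slot
   is the adjacent one at offset 40, x21 is added to the component set as
   well so that the save/restore can later be emitted as a single stp/ldp
   pair.  */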
5987
5988 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5989 Nothing to do for aarch64. */
5990
5991 static void
5992 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5993 {
5994 }
5995
5996 /* Return the next set bit in BMP from START onwards. Return the total number
5997 of bits in BMP if no set bit is found at or after START. */
5998
5999 static unsigned int
6000 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6001 {
6002 unsigned int nbits = SBITMAP_SIZE (bmp);
6003 if (start == nbits)
6004 return start;
6005
6006 gcc_assert (start < nbits);
6007 for (unsigned int i = start; i < nbits; i++)
6008 if (bitmap_bit_p (bmp, i))
6009 return i;
6010
6011 return nbits;
6012 }
6013
6014 /* Do the work for aarch64_emit_prologue_components and
6015 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6016 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6017 for these components or the epilogue sequence. That is, it determines
6018 whether we should emit stores or loads and what kind of CFA notes to attach
6019 to the insns. Otherwise the logic for the two sequences is very
6020 similar. */
6021
6022 static void
6023 aarch64_process_components (sbitmap components, bool prologue_p)
6024 {
6025 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6026 ? HARD_FRAME_POINTER_REGNUM
6027 : STACK_POINTER_REGNUM);
6028
6029 unsigned last_regno = SBITMAP_SIZE (components);
6030 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6031 rtx_insn *insn = NULL;
6032
6033 while (regno != last_regno)
6034 {
6035 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6036 so DFmode for the vector registers is enough. For simd functions
6037 we want to save the low 128 bits. */
6038 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6039
6040 rtx reg = gen_rtx_REG (mode, regno);
6041 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6042 if (!frame_pointer_needed)
6043 offset += cfun->machine->frame.frame_size
6044 - cfun->machine->frame.hard_fp_offset;
6045 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6046 rtx mem = gen_frame_mem (mode, addr);
6047
6048 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6049 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6050 /* No more registers to handle after REGNO.
6051 Emit a single save/restore and exit. */
6052 if (regno2 == last_regno)
6053 {
6054 insn = emit_insn (set);
6055 RTX_FRAME_RELATED_P (insn) = 1;
6056 if (prologue_p)
6057 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6058 else
6059 add_reg_note (insn, REG_CFA_RESTORE, reg);
6060 break;
6061 }
6062
6063 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6064 /* The next register is not of the same class or its offset is not
6065 mergeable with the current one into a pair. */
6066 if (!satisfies_constraint_Ump (mem)
6067 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6068 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6069 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6070 GET_MODE_SIZE (mode)))
6071 {
6072 insn = emit_insn (set);
6073 RTX_FRAME_RELATED_P (insn) = 1;
6074 if (prologue_p)
6075 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6076 else
6077 add_reg_note (insn, REG_CFA_RESTORE, reg);
6078
6079 regno = regno2;
6080 continue;
6081 }
6082
6083 /* REGNO2 can be saved/restored in a pair with REGNO. */
6084 rtx reg2 = gen_rtx_REG (mode, regno2);
6085 if (!frame_pointer_needed)
6086 offset2 += cfun->machine->frame.frame_size
6087 - cfun->machine->frame.hard_fp_offset;
6088 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6089 rtx mem2 = gen_frame_mem (mode, addr2);
6090 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6091 : gen_rtx_SET (reg2, mem2);
6092
6093 if (prologue_p)
6094 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6095 else
6096 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6097
6098 RTX_FRAME_RELATED_P (insn) = 1;
6099 if (prologue_p)
6100 {
6101 add_reg_note (insn, REG_CFA_OFFSET, set);
6102 add_reg_note (insn, REG_CFA_OFFSET, set2);
6103 }
6104 else
6105 {
6106 add_reg_note (insn, REG_CFA_RESTORE, reg);
6107 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6108 }
6109
6110 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6111 }
6112 }
6113
6114 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6115
6116 static void
6117 aarch64_emit_prologue_components (sbitmap components)
6118 {
6119 aarch64_process_components (components, true);
6120 }
6121
6122 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6123
6124 static void
6125 aarch64_emit_epilogue_components (sbitmap components)
6126 {
6127 aarch64_process_components (components, false);
6128 }
6129
6130 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6131
6132 static void
6133 aarch64_set_handled_components (sbitmap components)
6134 {
6135 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6136 if (bitmap_bit_p (components, regno))
6137 cfun->machine->reg_is_wrapped_separately[regno] = true;
6138 }
6139
6140 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6141 determine the probe offset for alloca. */
6142
6143 static HOST_WIDE_INT
6144 aarch64_stack_clash_protection_alloca_probe_range (void)
6145 {
6146 return STACK_CLASH_CALLER_GUARD;
6147 }
6148
6149
6150 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6151 registers. If POLY_SIZE is not large enough to require a probe this function
6152 will only adjust the stack. When allocating the stack space
6153 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6154 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6155 arguments. If we are then we ensure that any allocation larger than the ABI
6156 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6157 maintained.
6158
6159 We emit barriers after each stack adjustment to prevent optimizations from
6160 breaking the invariant that we never drop the stack more than a page. This
6161 invariant is needed to make it easier to correctly handle asynchronous
6162 events: if we were to allow the stack to be dropped by more than a page
6163 and then issue multiple probes for it, and a signal arrived somewhere in
6164 between, the signal handler would not know the state of the stack and could
6165 make no assumptions about which pages have been probed. */
6166
6167 static void
6168 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6169 poly_int64 poly_size,
6170 bool frame_related_p,
6171 bool final_adjustment_p)
6172 {
6173 HOST_WIDE_INT guard_size
6174 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6175 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6176 /* When doing the final adjustment for the outgoing argument size we can't
6177 assume that LR was saved at position 0. So subtract its offset from the
6178 ABI safe buffer so that we don't accidentally allow an adjustment that
6179 would result in an allocation larger than the ABI buffer without
6180 probing. */
6181 HOST_WIDE_INT min_probe_threshold
6182 = final_adjustment_p
6183 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6184 : guard_size - guard_used_by_caller;
6185
6186 poly_int64 frame_size = cfun->machine->frame.frame_size;
6187
6188 /* We should always have a positive probe threshold. */
6189 gcc_assert (min_probe_threshold > 0);
6190
6191 if (flag_stack_clash_protection && !final_adjustment_p)
6192 {
6193 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6194 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6195
6196 if (known_eq (frame_size, 0))
6197 {
6198 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6199 }
6200 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6201 && known_lt (final_adjust, guard_used_by_caller))
6202 {
6203 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6204 }
6205 }
6206
6207 /* If SIZE is not large enough to require probing, just adjust the stack and
6208 exit. */
6209 if (known_lt (poly_size, min_probe_threshold)
6210 || !flag_stack_clash_protection)
6211 {
6212 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6213 return;
6214 }
6215
6216 HOST_WIDE_INT size;
6217 /* Handle the SVE non-constant case first. */
6218 if (!poly_size.is_constant (&size))
6219 {
6220 if (dump_file)
6221 {
6222 fprintf (dump_file, "Stack clash SVE prologue: ");
6223 print_dec (poly_size, dump_file);
6224 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6225 }
6226
6227 /* First calculate the amount of bytes we're actually spilling. */
6228 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6229 poly_size, temp1, temp2, false, true);
6230
6231 rtx_insn *insn = get_last_insn ();
6232
6233 if (frame_related_p)
6234 {
6235 /* This is done to provide unwinding information for the stack
6236 adjustments we're about to do, however to prevent the optimizers
6237 from removing the R11 move and leaving the CFA note (which would be
6238 very wrong) we tie the old and new stack pointer together.
6239 The tie will expand to nothing but the optimizers will not touch
6240 the instruction. */
6241 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6242 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6243 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6244
6245 /* We want the CFA independent of the stack pointer for the
6246 duration of the loop. */
6247 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6248 RTX_FRAME_RELATED_P (insn) = 1;
6249 }
6250
6251 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6252 rtx guard_const = gen_int_mode (guard_size, Pmode);
6253
6254 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6255 stack_pointer_rtx, temp1,
6256 probe_const, guard_const));
6257
6258 /* Now reset the CFA register if needed. */
6259 if (frame_related_p)
6260 {
6261 add_reg_note (insn, REG_CFA_DEF_CFA,
6262 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6263 gen_int_mode (poly_size, Pmode)));
6264 RTX_FRAME_RELATED_P (insn) = 1;
6265 }
6266
6267 return;
6268 }
6269
6270 if (dump_file)
6271 fprintf (dump_file,
6272 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6273 " bytes, probing will be required.\n", size);
6274
6275 /* Round size to the nearest multiple of guard_size, and calculate the
6276 residual as the difference between the original size and the rounded
6277 size. */
6278 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6279 HOST_WIDE_INT residual = size - rounded_size;
6280
6281 /* We can handle a small number of allocations/probes inline. Otherwise
6282 punt to a loop. */
6283 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6284 {
6285 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6286 {
6287 aarch64_sub_sp (NULL, temp2, guard_size, true);
6288 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6289 guard_used_by_caller));
6290 emit_insn (gen_blockage ());
6291 }
6292 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6293 }
6294 else
6295 {
6296 /* Compute the ending address. */
6297 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6298 temp1, NULL, false, true);
6299 rtx_insn *insn = get_last_insn ();
6300
6301 /* For the initial allocation, we don't have a frame pointer
6302 set up, so we always need CFI notes. If we're doing the
6303 final allocation, then we may have a frame pointer, in which
6304 case it is the CFA, otherwise we need CFI notes.
6305
6306 We can determine which allocation we are doing by looking at
6307 the value of FRAME_RELATED_P since the final allocations are not
6308 frame related. */
6309 if (frame_related_p)
6310 {
6311 /* We want the CFA independent of the stack pointer for the
6312 duration of the loop. */
6313 add_reg_note (insn, REG_CFA_DEF_CFA,
6314 plus_constant (Pmode, temp1, rounded_size));
6315 RTX_FRAME_RELATED_P (insn) = 1;
6316 }
6317
6318 /* This allocates and probes the stack. Note that this re-uses some of
6319 the existing Ada stack protection code. However we are guaranteed not
6320 to enter the non loop or residual branches of that code.
6321
6322 The non-loop part won't be entered because if our allocation amount
6323 doesn't require a loop, the case above would handle it.
6324
6325 The residual amount won't be entered because TEMP1 is a multiple of
6326 the allocation size. The residual will always be 0. As such, the only
6327 part we are actually using from that code is the loop setup. The
6328 actual probing is done in aarch64_output_probe_stack_range. */
6329 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6330 stack_pointer_rtx, temp1));
6331
6332 /* Now reset the CFA register if needed. */
6333 if (frame_related_p)
6334 {
6335 add_reg_note (insn, REG_CFA_DEF_CFA,
6336 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6337 RTX_FRAME_RELATED_P (insn) = 1;
6338 }
6339
6340 emit_insn (gen_blockage ());
6341 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6342 }
6343
6344 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6345 be probed. This maintains the requirement that each page is probed at
6346 least once. For initial probing we probe only if the allocation is
6347 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6348 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6349 GUARD_SIZE. This means that for any allocation that is large enough to
6350 trigger a probe here, we'll have at least one, and if an allocation is not
6351 large enough for this code to emit anything for it, the page will have been
6352 probed by the saving of FP/LR, either by this function or by any callees. If
6353 we don't have any callees then we won't have more stack adjustments and so
6354 are still safe. */
6355 if (residual)
6356 {
6357 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6358 /* If we're doing final adjustments, and we've done any full page
6359 allocations then any residual needs to be probed. */
6360 if (final_adjustment_p && rounded_size != 0)
6361 min_probe_threshold = 0;
6362 /* If doing a small final adjustment, we always probe at offset 0.
6363 This is done to avoid issues when LR is not at position 0 or when
6364 the final adjustment is smaller than the probing offset. */
6365 else if (final_adjustment_p && rounded_size == 0)
6366 residual_probe_offset = 0;
6367
6368 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6369 if (residual >= min_probe_threshold)
6370 {
6371 if (dump_file)
6372 fprintf (dump_file,
6373 "Stack clash AArch64 prologue residuals: "
6374 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6375 "\n", residual);
6376
6377 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6378 residual_probe_offset));
6379 emit_insn (gen_blockage ());
6380 }
6381 }
6382 }
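
/* A worked example of the code above (illustrative, assuming the default
   64 KiB guard size, the 1 KiB STACK_CLASH_CALLER_GUARD and an unroll limit
   that permits two pages inline): for a constant initial allocation of
   147456 bytes (144 KiB), rounded_size is 131072 and residual is 16384, so
   the prologue allocation becomes roughly

	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 16384

   with no explicit probe for the residual, because 16384 is below the
   64 KiB - 1 KiB threshold and the subsequent FP/LR saves act as the
   implicit probe for that page.  */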
6383
6384 /* Return 1 if the register is used by the epilogue. We need to say the
6385 return register is used, but only after epilogue generation is complete.
6386 Note that in the case of sibcalls, the values "used by the epilogue" are
6387 considered live at the start of the called function.
6388
6389 For SIMD functions we need to return 1 for FP registers that are saved and
6390 restored by a function but are not zero in call_used_regs. If we do not do
6391 this, optimizations may remove the restore of the register. */
6392
6393 int
6394 aarch64_epilogue_uses (int regno)
6395 {
6396 if (epilogue_completed)
6397 {
6398 if (regno == LR_REGNUM)
6399 return 1;
6400 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6401 return 1;
6402 }
6403 return 0;
6404 }
6405
6406 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6407 is saved at BASE + OFFSET. */
6408
6409 static void
6410 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6411 rtx base, poly_int64 offset)
6412 {
6413 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6414 add_reg_note (insn, REG_CFA_EXPRESSION,
6415 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6416 }
6417
6418 /* AArch64 stack frames generated by this compiler look like:
6419
6420 +-------------------------------+
6421 | |
6422 | incoming stack arguments |
6423 | |
6424 +-------------------------------+
6425 | | <-- incoming stack pointer (aligned)
6426 | callee-allocated save area |
6427 | for register varargs |
6428 | |
6429 +-------------------------------+
6430 | local variables | <-- frame_pointer_rtx
6431 | |
6432 +-------------------------------+
6433 | padding | \
6434 +-------------------------------+ |
6435 | callee-saved registers | | frame.saved_regs_size
6436 +-------------------------------+ |
6437 | LR' | |
6438 +-------------------------------+ |
6439 | FP' | / <- hard_frame_pointer_rtx (aligned)
6440 +-------------------------------+
6441 | dynamic allocation |
6442 +-------------------------------+
6443 | padding |
6444 +-------------------------------+
6445 | outgoing stack arguments | <-- arg_pointer
6446 | |
6447 +-------------------------------+
6448 | | <-- stack_pointer_rtx (aligned)
6449
6450 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6451 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6452 unchanged.
6453
6454 By default for stack-clash we assume the guard is at least 64KB, but this
6455 value is configurable to either 4KB or 64KB. We also force the guard size to
6456 be the same as the probing interval and both values are kept in sync.
6457
6458 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6459 on the guard size) of stack space without probing.
6460
6461 When probing is needed, we emit a probe at the start of the prologue
6462 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6463
6464 We have to track how much space has been allocated and the only stores
6465 to the stack we track as implicit probes are the FP/LR stores.
6466
6467 For outgoing arguments we probe if the size is larger than 1KB, such that
6468 the ABI specified buffer is maintained for the next callee.
6469
6470 The following registers are reserved during frame layout and should not be
6471 used for any other purpose:
6472
6473 - r11: Used by stack clash protection when SVE is enabled.
6474 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6475 - r14 and r15: Used for speculation tracking.
6476 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6477 - r30(LR), r29(FP): Used by standard frame layout.
6478
6479 These registers must be avoided in frame layout related code unless the
6480 explicit intention is to interact with one of the features listed above. */
6481
6482 /* Generate the prologue instructions for entry into a function.
6483 Establish the stack frame by decreasing the stack pointer with a
6484 properly calculated size and, if necessary, create a frame record
6485 filled with the values of LR and previous frame pointer. The
6486 current FP is also set up if it is in use. */
6487
6488 void
6489 aarch64_expand_prologue (void)
6490 {
6491 poly_int64 frame_size = cfun->machine->frame.frame_size;
6492 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6493 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6494 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6495 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6496 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6497 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6498 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6499 rtx_insn *insn;
6500
6501 /* Sign return address for functions. */
6502 if (aarch64_return_address_signing_enabled ())
6503 {
6504 switch (aarch64_ra_sign_key)
6505 {
6506 case AARCH64_KEY_A:
6507 insn = emit_insn (gen_paciasp ());
6508 break;
6509 case AARCH64_KEY_B:
6510 insn = emit_insn (gen_pacibsp ());
6511 break;
6512 default:
6513 gcc_unreachable ();
6514 }
6515 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6516 RTX_FRAME_RELATED_P (insn) = 1;
6517 }
6518
6519 if (flag_stack_usage_info)
6520 current_function_static_stack_size = constant_lower_bound (frame_size);
6521
6522 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6523 {
6524 if (crtl->is_leaf && !cfun->calls_alloca)
6525 {
6526 if (maybe_gt (frame_size, PROBE_INTERVAL)
6527 && maybe_gt (frame_size, get_stack_check_protect ()))
6528 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6529 (frame_size
6530 - get_stack_check_protect ()));
6531 }
6532 else if (maybe_gt (frame_size, 0))
6533 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6534 }
6535
6536 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6537 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6538
6539 /* In theory we should never have both an initial adjustment
6540 and a callee save adjustment. Verify that is the case since the
6541 code below does not handle it for -fstack-clash-protection. */
6542 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6543
6544 /* Will only probe if the initial adjustment is larger than the guard
6545 less the amount of the guard reserved for use by the caller's
6546 outgoing args. */
6547 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6548 true, false);
6549
6550 if (callee_adjust != 0)
6551 aarch64_push_regs (reg1, reg2, callee_adjust);
6552
6553 if (emit_frame_chain)
6554 {
6555 poly_int64 reg_offset = callee_adjust;
6556 if (callee_adjust == 0)
6557 {
6558 reg1 = R29_REGNUM;
6559 reg2 = R30_REGNUM;
6560 reg_offset = callee_offset;
6561 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6562 }
6563 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6564 stack_pointer_rtx, callee_offset,
6565 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6566 if (frame_pointer_needed && !frame_size.is_constant ())
6567 {
6568 /* Variable-sized frames need to describe the save slot
6569 address using DW_CFA_expression rather than DW_CFA_offset.
6570 This means that, without taking further action, the
6571 locations of the registers that we've already saved would
6572 remain based on the stack pointer even after we redefine
6573 the CFA based on the frame pointer. We therefore need new
6574 DW_CFA_expressions to re-express the save slots with addresses
6575 based on the frame pointer. */
6576 rtx_insn *insn = get_last_insn ();
6577 gcc_assert (RTX_FRAME_RELATED_P (insn));
6578
6579 /* Add an explicit CFA definition if this was previously
6580 implicit. */
6581 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6582 {
6583 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6584 callee_offset);
6585 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6586 gen_rtx_SET (hard_frame_pointer_rtx, src));
6587 }
6588
6589 /* Change the save slot expressions for the registers that
6590 we've already saved. */
6591 reg_offset -= callee_offset;
6592 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6593 reg_offset + UNITS_PER_WORD);
6594 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6595 reg_offset);
6596 }
6597 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6598 }
6599
6600 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6601 callee_adjust != 0 || emit_frame_chain);
6602 if (aarch64_simd_decl_p (cfun->decl))
6603 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6604 callee_adjust != 0 || emit_frame_chain);
6605 else
6606 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6607 callee_adjust != 0 || emit_frame_chain);
6608
6609 /* We may need to probe the final adjustment if it is larger than the guard
6610 that is assumed by the callee. */
6611 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6612 !frame_pointer_needed, true);
6613 }
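
/* For illustration: with -mbranch-protection=pac-ret (A key) and a frame
   that takes the "simple, small frame" path in aarch64_layout_frame, the
   prologue emitted above is expected to be along the lines of

	paciasp
	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]

   i.e. the return address is signed first, then the frame record and the
   remaining callee saves are laid down as described above.  */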
6614
6615 /* Return TRUE if we can use a simple_return insn.
6616
6617 This function checks whether the callee-saved register area is empty, which
6618 means no restore actions are needed. The pro_and_epilogue pass uses
6619 this to check whether the shrink-wrapping optimization is feasible. */
6620
6621 bool
6622 aarch64_use_return_insn_p (void)
6623 {
6624 if (!reload_completed)
6625 return false;
6626
6627 if (crtl->profile)
6628 return false;
6629
6630 return known_eq (cfun->machine->frame.frame_size, 0);
6631 }
6632
6633 /* Return false for non-leaf SIMD functions in order to avoid
6634 shrink-wrapping them. Doing this will lose the necessary
6635 save/restore of FP registers. */
6636
6637 bool
6638 aarch64_use_simple_return_insn_p (void)
6639 {
6640 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6641 return false;
6642
6643 return true;
6644 }
6645
6646 /* Generate the epilogue instructions for returning from a function.
6647 This is almost exactly the reverse of the prolog sequence, except
6648 that we need to insert barriers to avoid scheduling loads that read
6649 from a deallocated stack, and we optimize the unwind records by
6650 emitting them all together if possible. */
6651 void
6652 aarch64_expand_epilogue (bool for_sibcall)
6653 {
6654 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6655 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6656 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6657 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6658 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6659 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6660 rtx cfi_ops = NULL;
6661 rtx_insn *insn;
6662 /* A stack clash protection prologue may not have left EP0_REGNUM or
6663 EP1_REGNUM in a usable state. The same is true for allocations
6664 with an SVE component, since we then need both temporary registers
6665 for each allocation. For stack clash we are in a usable state if
6666 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6667 HOST_WIDE_INT guard_size
6668 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6669 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6670
6671 /* We can re-use the registers when the allocation amount is smaller than
6672 guard_size - guard_used_by_caller because we won't be doing any probes
6673 then. In such situations the register should remain live with the correct
6674 value. */
6675 bool can_inherit_p = (initial_adjust.is_constant ()
6676 && final_adjust.is_constant ())
6677 && (!flag_stack_clash_protection
6678 || known_lt (initial_adjust,
6679 guard_size - guard_used_by_caller));
6680
6681 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6682 bool need_barrier_p
6683 = maybe_ne (get_frame_size ()
6684 + cfun->machine->frame.saved_varargs_size, 0);
6685
6686 /* Emit a barrier to prevent loads from a deallocated stack. */
6687 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6688 || cfun->calls_alloca
6689 || crtl->calls_eh_return)
6690 {
6691 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6692 need_barrier_p = false;
6693 }
6694
6695 /* Restore the stack pointer from the frame pointer if it may not
6696 be the same as the stack pointer. */
6697 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6698 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6699 if (frame_pointer_needed
6700 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6701 /* If writeback is used when restoring callee-saves, the CFA
6702 is restored on the instruction doing the writeback. */
6703 aarch64_add_offset (Pmode, stack_pointer_rtx,
6704 hard_frame_pointer_rtx, -callee_offset,
6705 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6706 else
6707 /* The case where we need to re-use the register here is very rare, so
6708 avoid the complicated condition and just always emit a move if the
6709 immediate doesn't fit. */
6710 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6711
6712 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6713 callee_adjust != 0, &cfi_ops);
6714 if (aarch64_simd_decl_p (cfun->decl))
6715 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6716 callee_adjust != 0, &cfi_ops);
6717 else
6718 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6719 callee_adjust != 0, &cfi_ops);
6720
6721 if (need_barrier_p)
6722 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6723
6724 if (callee_adjust != 0)
6725 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6726
6727 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6728 {
6729 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6730 insn = get_last_insn ();
6731 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6732 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6733 RTX_FRAME_RELATED_P (insn) = 1;
6734 cfi_ops = NULL;
6735 }
6736
6737 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6738 restrict the emit_move optimization to leaf functions. */
6739 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6740 (!can_inherit_p || !crtl->is_leaf
6741 || df_regs_ever_live_p (EP0_REGNUM)));
6742
6743 if (cfi_ops)
6744 {
6745 /* Emit delayed restores and reset the CFA to be SP. */
6746 insn = get_last_insn ();
6747 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6748 REG_NOTES (insn) = cfi_ops;
6749 RTX_FRAME_RELATED_P (insn) = 1;
6750 }
6751
6752 /* We prefer to emit the combined return/authenticate instruction RETAA,
6753 however there are three cases in which we must instead emit an explicit
6754 authentication instruction.
6755
6756 1) Sibcalls don't return in a normal way, so if we're about to call one
6757 we must authenticate.
6758
6759 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6760 generating code for !TARGET_ARMV8_3 we can't use it and must
6761 explicitly authenticate.
6762
6763 3) On an eh_return path we make extra stack adjustments to update the
6764 canonical frame address to be the exception handler's CFA. We want
6765 to authenticate using the CFA of the function which calls eh_return.
6766 */
6767 if (aarch64_return_address_signing_enabled ()
6768 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6769 {
6770 switch (aarch64_ra_sign_key)
6771 {
6772 case AARCH64_KEY_A:
6773 insn = emit_insn (gen_autiasp ());
6774 break;
6775 case AARCH64_KEY_B:
6776 insn = emit_insn (gen_autibsp ());
6777 break;
6778 default:
6779 gcc_unreachable ();
6780 }
6781 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6782 RTX_FRAME_RELATED_P (insn) = 1;
6783 }
6784
6785 /* Stack adjustment for exception handler. */
6786 if (crtl->calls_eh_return && !for_sibcall)
6787 {
6788 /* We need to unwind the stack by the offset computed by
6789 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6790 to be SP; letting the CFA move during this adjustment
6791 is just as correct as retaining the CFA from the body
6792 of the function. Therefore, do nothing special. */
6793 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6794 }
6795
6796 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6797 if (!for_sibcall)
6798 emit_jump_insn (ret_rtx);
6799 }
6800
6801 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6802 normally or return to a previous frame after unwinding.
6803
6804 An EH return uses a single shared return sequence. The epilogue is
6805 exactly like a normal epilogue except that it has an extra input
6806 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6807 that must be applied after the frame has been destroyed. An extra label
6808 is inserted before the epilogue which initializes this register to zero,
6809 and this is the entry point for a normal return.
6810
6811 An actual EH return updates the return address, initializes the stack
6812 adjustment and jumps directly into the epilogue (bypassing the zeroing
6813 of the adjustment). Since the return address is typically saved on the
6814 stack when a function makes a call, the saved LR must be updated outside
6815 the epilogue.
6816
6817 This poses problems as the store is generated well before the epilogue,
6818 so the offset of LR is not known yet. Also optimizations will remove the
6819 store as it appears dead, even after the epilogue is generated (as the
6820 base or offset for loading LR is different in many cases).
6821
6822 To avoid these problems this implementation forces the frame pointer
6823 in eh_return functions so that the location of LR is fixed and known early.
6824 It also marks the store volatile, so no optimization is permitted to
6825 remove the store. */
6826 rtx
6827 aarch64_eh_return_handler_rtx (void)
6828 {
6829 rtx tmp = gen_frame_mem (Pmode,
6830 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6831
6832 /* Mark the store volatile, so no optimization is permitted to remove it. */
6833 MEM_VOLATILE_P (tmp) = true;
6834 return tmp;
6835 }
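
/* Illustrative note (not from the original sources): assuming the LP64 ABI,
   where Pmode is DImode and UNITS_PER_WORD is 8, the RTX built above is
   effectively

     (mem/v:DI (plus:DI (reg:DI x29) (const_int 8)))

   i.e. the saved-LR slot of the frame record, which eh_return rewrites. */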
6836
6837 /* Output code to add DELTA to the first argument, and then jump
6838 to FUNCTION. Used for C++ multiple inheritance. */
6839 static void
6840 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6841 HOST_WIDE_INT delta,
6842 HOST_WIDE_INT vcall_offset,
6843 tree function)
6844 {
6845 /* The this pointer is always in x0. Note that this differs from
6846 Arm, where the this pointer may be bumped to r1 if r0 is required
6847 to return a pointer to an aggregate. On AArch64 a result value
6848 pointer will be in x8. */
6849 int this_regno = R0_REGNUM;
6850 rtx this_rtx, temp0, temp1, addr, funexp;
6851 rtx_insn *insn;
6852 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6853
6854 if (aarch64_bti_enabled ())
6855 emit_insn (gen_bti_c ());
6856
6857 reload_completed = 1;
6858 emit_note (NOTE_INSN_PROLOGUE_END);
6859
6860 this_rtx = gen_rtx_REG (Pmode, this_regno);
6861 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6862 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6863
6864 if (vcall_offset == 0)
6865 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6866 else
6867 {
6868 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6869
6870 addr = this_rtx;
6871 if (delta != 0)
6872 {
6873 if (delta >= -256 && delta < 256)
6874 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6875 plus_constant (Pmode, this_rtx, delta));
6876 else
6877 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6878 temp1, temp0, false);
6879 }
6880
6881 if (Pmode == ptr_mode)
6882 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6883 else
6884 aarch64_emit_move (temp0,
6885 gen_rtx_ZERO_EXTEND (Pmode,
6886 gen_rtx_MEM (ptr_mode, addr)));
6887
6888 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6889 addr = plus_constant (Pmode, temp0, vcall_offset);
6890 else
6891 {
6892 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6893 Pmode);
6894 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6895 }
6896
6897 if (Pmode == ptr_mode)
6898 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6899 else
6900 aarch64_emit_move (temp1,
6901 gen_rtx_SIGN_EXTEND (Pmode,
6902 gen_rtx_MEM (ptr_mode, addr)));
6903
6904 emit_insn (gen_add2_insn (this_rtx, temp1));
6905 }
6906
6907 /* Generate a tail call to the target function. */
6908 if (!TREE_USED (function))
6909 {
6910 assemble_external (function);
6911 TREE_USED (function) = 1;
6912 }
6913 funexp = XEXP (DECL_RTL (function), 0);
6914 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6915 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6916 SIBLING_CALL_P (insn) = 1;
6917
6918 insn = get_insns ();
6919 shorten_branches (insn);
6920
6921 assemble_start_function (thunk, fnname);
6922 final_start_function (insn, file, 1);
6923 final (insn, file, 1);
6924 final_end_function ();
6925 assemble_end_function (thunk, fnname);
6926
6927 /* Stop pretending to be a post-reload pass. */
6928 reload_completed = 0;
6929 }
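
/* Rough sketch of the output (illustrative only): for DELTA == 8 and
   VCALL_OFFSET == 0 the code above emits approximately

       add     x0, x0, 8
       b       <function>

   (preceded by "bti c" when branch protection is enabled): the "this"
   pointer in x0 is adjusted and control tail-calls the target. */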
6930
6931 static bool
6932 aarch64_tls_referenced_p (rtx x)
6933 {
6934 if (!TARGET_HAVE_TLS)
6935 return false;
6936 subrtx_iterator::array_type array;
6937 FOR_EACH_SUBRTX (iter, array, x, ALL)
6938 {
6939 const_rtx x = *iter;
6940 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6941 return true;
6942 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6943 TLS offsets, not real symbol references. */
6944 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6945 iter.skip_subrtxes ();
6946 }
6947 return false;
6948 }
6949
6950
6951 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6952 a left shift of 0 or 12 bits. */
6953 bool
6954 aarch64_uimm12_shift (HOST_WIDE_INT val)
6955 {
6956 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6957 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6958 );
6959 }
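
/* Illustrative examples (not from the original sources):

     aarch64_uimm12_shift (0xabc)     -> true   (12 bits, LSL #0)
     aarch64_uimm12_shift (0xabc000)  -> true   (12 bits, LSL #12)
     aarch64_uimm12_shift (0x1abc)    -> false  (set bits span both halves) */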
6960
6961 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6962 that can be created with a left shift of 0 or 12. */
6963 static HOST_WIDE_INT
6964 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6965 {
6966 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6967 handle correctly. */
6968 gcc_assert ((val & 0xffffff) == val);
6969
6970 if (((val & 0xfff) << 0) == val)
6971 return val;
6972
6973 return val & (0xfff << 12);
6974 }
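
/* For example (illustrative), aarch64_clamp_to_uimm12_shift (0x123456)
   returns 0x123000: the value does not fit in the low 12 bits, so it is
   clamped to the nearest value expressible as a 12-bit immediate shifted
   left by 12. */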
6975
6976 /* Return true if val is an immediate that can be loaded into a
6977 register by a MOVZ instruction. */
6978 static bool
6979 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6980 {
6981 if (GET_MODE_SIZE (mode) > 4)
6982 {
6983 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6984 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6985 return true;
6986 }
6987 else
6988 {
6989 /* Ignore sign extension. */
6990 val &= (HOST_WIDE_INT) 0xffffffff;
6991 }
6992 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6993 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6994 }
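
/* Illustrative DImode examples (not from the original sources):

     0x000000000000abcd  -> true   (MOVZ x, #0xabcd)
     0x00000000abcd0000  -> true   (MOVZ x, #0xabcd, LSL #16)
     0x0000abcd00000000  -> true   (MOVZ x, #0xabcd, LSL #32)
     0x0000000012345678  -> false  (more than one non-zero 16-bit chunk) */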
6995
6996 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6997 64-bit (DImode) integer. */
6998
6999 static unsigned HOST_WIDE_INT
7000 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7001 {
7002 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7003 while (size < 64)
7004 {
7005 val &= (HOST_WIDE_INT_1U << size) - 1;
7006 val |= val << size;
7007 size *= 2;
7008 }
7009 return val;
7010 }
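
/* For example (illustrative), replicating the QImode value 0x03 yields
   0x0303030303030303 and replicating the HImode value 0x00ff yields
   0x00ff00ff00ff00ff. */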
7011
7012 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7013
7014 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7015 {
7016 0x0000000100000001ull,
7017 0x0001000100010001ull,
7018 0x0101010101010101ull,
7019 0x1111111111111111ull,
7020 0x5555555555555555ull,
7021 };
7022
7023
7024 /* Return true if val is a valid bitmask immediate. */
7025
7026 bool
7027 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7028 {
7029 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7030 int bits;
7031
7032 /* Check for a single sequence of one bits and return quickly if so.
7033 The special cases of all ones and all zeroes return false. */
7034 val = aarch64_replicate_bitmask_imm (val_in, mode);
7035 tmp = val + (val & -val);
7036
7037 if (tmp == (tmp & -tmp))
7038 return (val + 1) > 1;
7039
7040 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7041 if (mode == SImode)
7042 val = (val << 32) | (val & 0xffffffff);
7043
7044 /* Invert if the immediate doesn't start with a zero bit - this means we
7045 only need to search for sequences of one bits. */
7046 if (val & 1)
7047 val = ~val;
7048
7049 /* Find the first set bit and set tmp to val with the first sequence of one
7050 bits removed. Return success if there is a single sequence of ones. */
7051 first_one = val & -val;
7052 tmp = val & (val + first_one);
7053
7054 if (tmp == 0)
7055 return true;
7056
7057 /* Find the next set bit and compute the difference in bit position. */
7058 next_one = tmp & -tmp;
7059 bits = clz_hwi (first_one) - clz_hwi (next_one);
7060 mask = val ^ tmp;
7061
7062 /* Check the bit position difference is a power of 2, and that the first
7063 sequence of one bits fits within 'bits' bits. */
7064 if ((mask >> bits) != 0 || bits != (bits & -bits))
7065 return false;
7066
7067 /* Check the sequence of one bits is repeated 64/bits times. */
7068 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7069 }
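
/* Worked example (illustrative, DImode): 0x00ff00ff00ff00ff is accepted -
   after inversion to 0xff00ff00ff00ff00 the code finds an 8-bit run of ones
   repeating every 16 bits and the multiplier table confirms the pattern
   fills all 64 bits.  0x0000000000001234 is rejected, since its set bits do
   not form a single run replicated at a power-of-two period. */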
7070
7071 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7072 Assumed precondition: VAL_IN is not zero. */
7073
7074 unsigned HOST_WIDE_INT
7075 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7076 {
7077 int lowest_bit_set = ctz_hwi (val_in);
7078 int highest_bit_set = floor_log2 (val_in);
7079 gcc_assert (val_in != 0);
7080
7081 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7082 (HOST_WIDE_INT_1U << lowest_bit_set));
7083 }
7084
7085 /* Create a constant in which all bits outside the range from the lowest set
7086 bit to the highest set bit of VAL_IN are set to 1. */
7087
7088 unsigned HOST_WIDE_INT
7089 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7090 {
7091 return val_in | ~aarch64_and_split_imm1 (val_in);
7092 }
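
/* Illustrative example (not from the original sources): for
   VAL_IN == 0x00ff00000000ff00,

     aarch64_and_split_imm1 -> 0x00ffffffffffff00  (ones from bit 8 to bit 55)
     aarch64_and_split_imm2 -> 0xffff00000000ffff

   and imm1 & imm2 == VAL_IN, so an AND with VAL_IN can be performed as two
   ANDs with these single-run / wrap-around bitmask immediates. */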
7093
7094 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7095
7096 bool
7097 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7098 {
7099 scalar_int_mode int_mode;
7100 if (!is_a <scalar_int_mode> (mode, &int_mode))
7101 return false;
7102
7103 if (aarch64_bitmask_imm (val_in, int_mode))
7104 return false;
7105
7106 if (aarch64_move_imm (val_in, int_mode))
7107 return false;
7108
7109 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7110
7111 return aarch64_bitmask_imm (imm2, int_mode);
7112 }
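
/* Continuing the example above (illustrative): 0x00ff00000000ff00 is neither
   a bitmask immediate nor a MOV-able immediate, but both split halves are
   encodable, so this predicate returns true and the AND can be emitted as

     and x0, x0, 0x00ffffffffffff00
     and x0, x0, 0xffff00000000ffff  */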
7113
7114 /* Return true if val is an immediate that can be loaded into a
7115 register in a single instruction. */
7116 bool
7117 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7118 {
7119 scalar_int_mode int_mode;
7120 if (!is_a <scalar_int_mode> (mode, &int_mode))
7121 return false;
7122
7123 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7124 return true;
7125 return aarch64_bitmask_imm (val, int_mode);
7126 }
7127
7128 static bool
7129 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7130 {
7131 rtx base, offset;
7132
7133 if (GET_CODE (x) == HIGH)
7134 return true;
7135
7136 /* There's no way to calculate VL-based values using relocations. */
7137 subrtx_iterator::array_type array;
7138 FOR_EACH_SUBRTX (iter, array, x, ALL)
7139 if (GET_CODE (*iter) == CONST_POLY_INT)
7140 return true;
7141
7142 split_const (x, &base, &offset);
7143 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7144 {
7145 if (aarch64_classify_symbol (base, INTVAL (offset))
7146 != SYMBOL_FORCE_TO_MEM)
7147 return true;
7148 else
7149 /* Avoid generating a 64-bit relocation in ILP32; leave it
7150 to aarch64_expand_mov_immediate to handle it properly. */
7151 return mode != ptr_mode;
7152 }
7153
7154 return aarch64_tls_referenced_p (x);
7155 }
7156
7157 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7158 The expansion for a table switch is quite expensive due to the number
7159 of instructions, the table lookup and the hard-to-predict indirect jump.
7160 When optimizing for speed at -O3 and above, use the per-core tuning if
7161 set; otherwise use tables for > 16 cases as a tradeoff between size and
7162 performance. When optimizing for size, use the default setting. */
7163
7164 static unsigned int
7165 aarch64_case_values_threshold (void)
7166 {
7167 /* Use the specified limit for the number of cases before using jump
7168 tables at higher optimization levels. */
7169 if (optimize > 2
7170 && selected_cpu->tune->max_case_values != 0)
7171 return selected_cpu->tune->max_case_values;
7172 else
7173 return optimize_size ? default_case_values_threshold () : 17;
7174 }
7175
7176 /* Return true if register REGNO is a valid index register.
7177 STRICT_P is true if REG_OK_STRICT is in effect. */
7178
7179 bool
7180 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7181 {
7182 if (!HARD_REGISTER_NUM_P (regno))
7183 {
7184 if (!strict_p)
7185 return true;
7186
7187 if (!reg_renumber)
7188 return false;
7189
7190 regno = reg_renumber[regno];
7191 }
7192 return GP_REGNUM_P (regno);
7193 }
7194
7195 /* Return true if register REGNO is a valid base register.
7196 STRICT_P is true if REG_OK_STRICT is in effect. */
7197
7198 bool
7199 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7200 {
7201 if (!HARD_REGISTER_NUM_P (regno))
7202 {
7203 if (!strict_p)
7204 return true;
7205
7206 if (!reg_renumber)
7207 return false;
7208
7209 regno = reg_renumber[regno];
7210 }
7211
7212 /* The fake registers will be eliminated to either the stack or
7213 hard frame pointer, both of which are usually valid base registers.
7214 Reload deals with the cases where the eliminated form isn't valid. */
7215 return (GP_REGNUM_P (regno)
7216 || regno == SP_REGNUM
7217 || regno == FRAME_POINTER_REGNUM
7218 || regno == ARG_POINTER_REGNUM);
7219 }
7220
7221 /* Return true if X is a valid base register.
7222 STRICT_P is true if REG_OK_STRICT is in effect. */
7223
7224 static bool
7225 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7226 {
7227 if (!strict_p
7228 && GET_CODE (x) == SUBREG
7229 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7230 x = SUBREG_REG (x);
7231
7232 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7233 }
7234
7235 /* Return true if address offset X is a valid index for mode MODE. If it is,
7236 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7237
7238 static bool
7239 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7240 machine_mode mode, bool strict_p)
7241 {
7242 enum aarch64_address_type type;
7243 rtx index;
7244 int shift;
7245
7246 /* (reg:P) */
7247 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7248 && GET_MODE (x) == Pmode)
7249 {
7250 type = ADDRESS_REG_REG;
7251 index = x;
7252 shift = 0;
7253 }
7254 /* (sign_extend:DI (reg:SI)) */
7255 else if ((GET_CODE (x) == SIGN_EXTEND
7256 || GET_CODE (x) == ZERO_EXTEND)
7257 && GET_MODE (x) == DImode
7258 && GET_MODE (XEXP (x, 0)) == SImode)
7259 {
7260 type = (GET_CODE (x) == SIGN_EXTEND)
7261 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7262 index = XEXP (x, 0);
7263 shift = 0;
7264 }
7265 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7266 else if (GET_CODE (x) == MULT
7267 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7268 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7269 && GET_MODE (XEXP (x, 0)) == DImode
7270 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7271 && CONST_INT_P (XEXP (x, 1)))
7272 {
7273 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7274 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7275 index = XEXP (XEXP (x, 0), 0);
7276 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7277 }
7278 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7279 else if (GET_CODE (x) == ASHIFT
7280 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7281 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7282 && GET_MODE (XEXP (x, 0)) == DImode
7283 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7284 && CONST_INT_P (XEXP (x, 1)))
7285 {
7286 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7287 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7288 index = XEXP (XEXP (x, 0), 0);
7289 shift = INTVAL (XEXP (x, 1));
7290 }
7291 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7292 else if ((GET_CODE (x) == SIGN_EXTRACT
7293 || GET_CODE (x) == ZERO_EXTRACT)
7294 && GET_MODE (x) == DImode
7295 && GET_CODE (XEXP (x, 0)) == MULT
7296 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7297 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7298 {
7299 type = (GET_CODE (x) == SIGN_EXTRACT)
7300 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7301 index = XEXP (XEXP (x, 0), 0);
7302 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7303 if (INTVAL (XEXP (x, 1)) != 32 + shift
7304 || INTVAL (XEXP (x, 2)) != 0)
7305 shift = -1;
7306 }
7307 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7308 (const_int 0xffffffff<<shift)) */
7309 else if (GET_CODE (x) == AND
7310 && GET_MODE (x) == DImode
7311 && GET_CODE (XEXP (x, 0)) == MULT
7312 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7313 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7314 && CONST_INT_P (XEXP (x, 1)))
7315 {
7316 type = ADDRESS_REG_UXTW;
7317 index = XEXP (XEXP (x, 0), 0);
7318 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7319 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7320 shift = -1;
7321 }
7322 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7323 else if ((GET_CODE (x) == SIGN_EXTRACT
7324 || GET_CODE (x) == ZERO_EXTRACT)
7325 && GET_MODE (x) == DImode
7326 && GET_CODE (XEXP (x, 0)) == ASHIFT
7327 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7328 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7329 {
7330 type = (GET_CODE (x) == SIGN_EXTRACT)
7331 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7332 index = XEXP (XEXP (x, 0), 0);
7333 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7334 if (INTVAL (XEXP (x, 1)) != 32 + shift
7335 || INTVAL (XEXP (x, 2)) != 0)
7336 shift = -1;
7337 }
7338 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7339 (const_int 0xffffffff<<shift)) */
7340 else if (GET_CODE (x) == AND
7341 && GET_MODE (x) == DImode
7342 && GET_CODE (XEXP (x, 0)) == ASHIFT
7343 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7344 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7345 && CONST_INT_P (XEXP (x, 1)))
7346 {
7347 type = ADDRESS_REG_UXTW;
7348 index = XEXP (XEXP (x, 0), 0);
7349 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7350 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7351 shift = -1;
7352 }
7353 /* (mult:P (reg:P) (const_int scale)) */
7354 else if (GET_CODE (x) == MULT
7355 && GET_MODE (x) == Pmode
7356 && GET_MODE (XEXP (x, 0)) == Pmode
7357 && CONST_INT_P (XEXP (x, 1)))
7358 {
7359 type = ADDRESS_REG_REG;
7360 index = XEXP (x, 0);
7361 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7362 }
7363 /* (ashift:P (reg:P) (const_int shift)) */
7364 else if (GET_CODE (x) == ASHIFT
7365 && GET_MODE (x) == Pmode
7366 && GET_MODE (XEXP (x, 0)) == Pmode
7367 && CONST_INT_P (XEXP (x, 1)))
7368 {
7369 type = ADDRESS_REG_REG;
7370 index = XEXP (x, 0);
7371 shift = INTVAL (XEXP (x, 1));
7372 }
7373 else
7374 return false;
7375
7376 if (!strict_p
7377 && GET_CODE (index) == SUBREG
7378 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7379 index = SUBREG_REG (index);
7380
7381 if (aarch64_sve_data_mode_p (mode))
7382 {
7383 if (type != ADDRESS_REG_REG
7384 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7385 return false;
7386 }
7387 else
7388 {
7389 if (shift != 0
7390 && !(IN_RANGE (shift, 1, 3)
7391 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7392 return false;
7393 }
7394
7395 if (REG_P (index)
7396 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7397 {
7398 info->type = type;
7399 info->offset = index;
7400 info->shift = shift;
7401 return true;
7402 }
7403
7404 return false;
7405 }
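
/* Illustrative example (not from the original sources): for an SImode access
   whose address is

     (plus:DI (reg:DI x0)
              (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4)))

   the MULT subexpression is passed here and classified as ADDRESS_REG_SXTW
   with shift == 2, i.e. the "[x0, w1, sxtw #2]" addressing form. */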
7406
7407 /* Return true if MODE is one of the modes for which we
7408 support LDP/STP operations. */
7409
7410 static bool
7411 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7412 {
7413 return mode == SImode || mode == DImode
7414 || mode == SFmode || mode == DFmode
7415 || (aarch64_vector_mode_supported_p (mode)
7416 && (known_eq (GET_MODE_SIZE (mode), 8)
7417 || (known_eq (GET_MODE_SIZE (mode), 16)
7418 && (aarch64_tune_params.extra_tuning_flags
7419 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7420 }
7421
7422 /* Return true if REGNO is a virtual pointer register, or an eliminable
7423 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7424 include stack_pointer or hard_frame_pointer. */
7425 static bool
7426 virt_or_elim_regno_p (unsigned regno)
7427 {
7428 return ((regno >= FIRST_VIRTUAL_REGISTER
7429 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7430 || regno == FRAME_POINTER_REGNUM
7431 || regno == ARG_POINTER_REGNUM);
7432 }
7433
7434 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7435 If it is, fill in INFO appropriately. STRICT_P is true if
7436 REG_OK_STRICT is in effect. */
7437
7438 bool
7439 aarch64_classify_address (struct aarch64_address_info *info,
7440 rtx x, machine_mode mode, bool strict_p,
7441 aarch64_addr_query_type type)
7442 {
7443 enum rtx_code code = GET_CODE (x);
7444 rtx op0, op1;
7445 poly_int64 offset;
7446
7447 HOST_WIDE_INT const_size;
7448
7449 /* On BE, we use load/store pair for all large int mode load/stores.
7450 TI/TFmode may also use a load/store pair. */
7451 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7452 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7453 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7454 || type == ADDR_QUERY_LDP_STP_N
7455 || mode == TImode
7456 || mode == TFmode
7457 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7458
7459 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7460 to the actual size of the memory being loaded/stored and the mode used for
7461 the address calculation is half of that size. */
7462 if (type == ADDR_QUERY_LDP_STP_N
7463 && known_eq (GET_MODE_SIZE (mode), 16))
7464 mode = DFmode;
7465
7466 bool allow_reg_index_p = (!load_store_pair_p
7467 && (known_lt (GET_MODE_SIZE (mode), 16)
7468 || vec_flags == VEC_ADVSIMD
7469 || vec_flags & VEC_SVE_DATA));
7470
7471 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7472 [Rn, #offset, MUL VL]. */
7473 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7474 && (code != REG && code != PLUS))
7475 return false;
7476
7477 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7478 REG addressing. */
7479 if (advsimd_struct_p
7480 && !BYTES_BIG_ENDIAN
7481 && (code != POST_INC && code != REG))
7482 return false;
7483
7484 gcc_checking_assert (GET_MODE (x) == VOIDmode
7485 || SCALAR_INT_MODE_P (GET_MODE (x)));
7486
7487 switch (code)
7488 {
7489 case REG:
7490 case SUBREG:
7491 info->type = ADDRESS_REG_IMM;
7492 info->base = x;
7493 info->offset = const0_rtx;
7494 info->const_offset = 0;
7495 return aarch64_base_register_rtx_p (x, strict_p);
7496
7497 case PLUS:
7498 op0 = XEXP (x, 0);
7499 op1 = XEXP (x, 1);
7500
7501 if (! strict_p
7502 && REG_P (op0)
7503 && virt_or_elim_regno_p (REGNO (op0))
7504 && poly_int_rtx_p (op1, &offset))
7505 {
7506 info->type = ADDRESS_REG_IMM;
7507 info->base = op0;
7508 info->offset = op1;
7509 info->const_offset = offset;
7510
7511 return true;
7512 }
7513
7514 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7515 && aarch64_base_register_rtx_p (op0, strict_p)
7516 && poly_int_rtx_p (op1, &offset))
7517 {
7518 info->type = ADDRESS_REG_IMM;
7519 info->base = op0;
7520 info->offset = op1;
7521 info->const_offset = offset;
7522
7523 /* TImode and TFmode values are allowed in both pairs of X
7524 registers and individual Q registers. The available
7525 address modes are:
7526 X,X: 7-bit signed scaled offset
7527 Q: 9-bit signed offset
7528 We conservatively require an offset representable in either mode.
7529 When performing the check for pairs of X registers i.e. LDP/STP
7530 pass down DImode since that is the natural size of the LDP/STP
7531 instruction memory accesses. */
7532 if (mode == TImode || mode == TFmode)
7533 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7534 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7535 || offset_12bit_unsigned_scaled_p (mode, offset)));
7536
7537 /* A 7-bit offset check because OImode will emit an ldp/stp
7538 instruction (only big endian will get here).
7539 For ldp/stp instructions, the offset is scaled for the size of a
7540 single element of the pair. */
7541 if (mode == OImode)
7542 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7543
7544 /* Three 9/12-bit offset checks because CImode will emit three
7545 ldr/str instructions (only big endian will get here). */
7546 if (mode == CImode)
7547 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7548 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7549 offset + 32)
7550 || offset_12bit_unsigned_scaled_p (V16QImode,
7551 offset + 32)));
7552
7553 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7554 instructions (only big endian will get here). */
7555 if (mode == XImode)
7556 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7557 && aarch64_offset_7bit_signed_scaled_p (TImode,
7558 offset + 32));
7559
7560 /* Make "m" use the LD1 offset range for SVE data modes, so
7561 that pre-RTL optimizers like ivopts will target that range
7562 instead of the wider LDR/STR range. */
7563 if (vec_flags == VEC_SVE_DATA)
7564 return (type == ADDR_QUERY_M
7565 ? offset_4bit_signed_scaled_p (mode, offset)
7566 : offset_9bit_signed_scaled_p (mode, offset));
7567
7568 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7569 {
7570 poly_int64 end_offset = (offset
7571 + GET_MODE_SIZE (mode)
7572 - BYTES_PER_SVE_VECTOR);
7573 return (type == ADDR_QUERY_M
7574 ? offset_4bit_signed_scaled_p (mode, offset)
7575 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7576 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7577 end_offset)));
7578 }
7579
7580 if (vec_flags == VEC_SVE_PRED)
7581 return offset_9bit_signed_scaled_p (mode, offset);
7582
7583 if (load_store_pair_p)
7584 return ((known_eq (GET_MODE_SIZE (mode), 4)
7585 || known_eq (GET_MODE_SIZE (mode), 8)
7586 || known_eq (GET_MODE_SIZE (mode), 16))
7587 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7588 else
7589 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7590 || offset_12bit_unsigned_scaled_p (mode, offset));
7591 }
7592
7593 if (allow_reg_index_p)
7594 {
7595 /* Look for base + (scaled/extended) index register. */
7596 if (aarch64_base_register_rtx_p (op0, strict_p)
7597 && aarch64_classify_index (info, op1, mode, strict_p))
7598 {
7599 info->base = op0;
7600 return true;
7601 }
7602 if (aarch64_base_register_rtx_p (op1, strict_p)
7603 && aarch64_classify_index (info, op0, mode, strict_p))
7604 {
7605 info->base = op1;
7606 return true;
7607 }
7608 }
7609
7610 return false;
7611
7612 case POST_INC:
7613 case POST_DEC:
7614 case PRE_INC:
7615 case PRE_DEC:
7616 info->type = ADDRESS_REG_WB;
7617 info->base = XEXP (x, 0);
7618 info->offset = NULL_RTX;
7619 return aarch64_base_register_rtx_p (info->base, strict_p);
7620
7621 case POST_MODIFY:
7622 case PRE_MODIFY:
7623 info->type = ADDRESS_REG_WB;
7624 info->base = XEXP (x, 0);
7625 if (GET_CODE (XEXP (x, 1)) == PLUS
7626 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7627 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7628 && aarch64_base_register_rtx_p (info->base, strict_p))
7629 {
7630 info->offset = XEXP (XEXP (x, 1), 1);
7631 info->const_offset = offset;
7632
7633 /* TImode and TFmode values are allowed in both pairs of X
7634 registers and individual Q registers. The available
7635 address modes are:
7636 X,X: 7-bit signed scaled offset
7637 Q: 9-bit signed offset
7638 We conservatively require an offset representable in either mode.
7639 */
7640 if (mode == TImode || mode == TFmode)
7641 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7642 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7643
7644 if (load_store_pair_p)
7645 return ((known_eq (GET_MODE_SIZE (mode), 4)
7646 || known_eq (GET_MODE_SIZE (mode), 8)
7647 || known_eq (GET_MODE_SIZE (mode), 16))
7648 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7649 else
7650 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7651 }
7652 return false;
7653
7654 case CONST:
7655 case SYMBOL_REF:
7656 case LABEL_REF:
7657 /* load literal: pc-relative constant pool entry. Only supported
7658 for SI mode or larger. */
7659 info->type = ADDRESS_SYMBOLIC;
7660
7661 if (!load_store_pair_p
7662 && GET_MODE_SIZE (mode).is_constant (&const_size)
7663 && const_size >= 4)
7664 {
7665 rtx sym, addend;
7666
7667 split_const (x, &sym, &addend);
7668 return ((GET_CODE (sym) == LABEL_REF
7669 || (GET_CODE (sym) == SYMBOL_REF
7670 && CONSTANT_POOL_ADDRESS_P (sym)
7671 && aarch64_pcrelative_literal_loads)));
7672 }
7673 return false;
7674
7675 case LO_SUM:
7676 info->type = ADDRESS_LO_SUM;
7677 info->base = XEXP (x, 0);
7678 info->offset = XEXP (x, 1);
7679 if (allow_reg_index_p
7680 && aarch64_base_register_rtx_p (info->base, strict_p))
7681 {
7682 rtx sym, offs;
7683 split_const (info->offset, &sym, &offs);
7684 if (GET_CODE (sym) == SYMBOL_REF
7685 && (aarch64_classify_symbol (sym, INTVAL (offs))
7686 == SYMBOL_SMALL_ABSOLUTE))
7687 {
7688 /* The symbol and offset must be aligned to the access size. */
7689 unsigned int align;
7690
7691 if (CONSTANT_POOL_ADDRESS_P (sym))
7692 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7693 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7694 {
7695 tree exp = SYMBOL_REF_DECL (sym);
7696 align = TYPE_ALIGN (TREE_TYPE (exp));
7697 align = aarch64_constant_alignment (exp, align);
7698 }
7699 else if (SYMBOL_REF_DECL (sym))
7700 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7701 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7702 && SYMBOL_REF_BLOCK (sym) != NULL)
7703 align = SYMBOL_REF_BLOCK (sym)->alignment;
7704 else
7705 align = BITS_PER_UNIT;
7706
7707 poly_int64 ref_size = GET_MODE_SIZE (mode);
7708 if (known_eq (ref_size, 0))
7709 ref_size = GET_MODE_SIZE (DImode);
7710
7711 return (multiple_p (INTVAL (offs), ref_size)
7712 && multiple_p (align / BITS_PER_UNIT, ref_size));
7713 }
7714 }
7715 return false;
7716
7717 default:
7718 return false;
7719 }
7720 }
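
/* Illustrative summary (not from the original sources): for a plain DImode
   scalar access the PLUS case above accepts, e.g.,

     [x0, #32760]   12-bit unsigned offset, scaled by 8
     [x0, #-256]    9-bit signed unscaled offset (LDUR/STUR form)

   while an offset such as #32768 is rejected and has to be legitimized by
   moving part of the offset into the base register. */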
7721
7722 /* Return true if the address X is valid for a PRFM instruction.
7723 STRICT_P is true if we should do strict checking with
7724 aarch64_classify_address. */
7725
7726 bool
7727 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7728 {
7729 struct aarch64_address_info addr;
7730
7731 /* PRFM accepts the same addresses as DImode... */
7732 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7733 if (!res)
7734 return false;
7735
7736 /* ... except writeback forms. */
7737 return addr.type != ADDRESS_REG_WB;
7738 }
7739
7740 bool
7741 aarch64_symbolic_address_p (rtx x)
7742 {
7743 rtx offset;
7744
7745 split_const (x, &x, &offset);
7746 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7747 }
7748
7749 /* Classify the base of symbolic expression X. */
7750
7751 enum aarch64_symbol_type
7752 aarch64_classify_symbolic_expression (rtx x)
7753 {
7754 rtx offset;
7755
7756 split_const (x, &x, &offset);
7757 return aarch64_classify_symbol (x, INTVAL (offset));
7758 }
7759
7760
7761 /* Return TRUE if X is a legitimate address for accessing memory in
7762 mode MODE. */
7763 static bool
7764 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7765 {
7766 struct aarch64_address_info addr;
7767
7768 return aarch64_classify_address (&addr, x, mode, strict_p);
7769 }
7770
7771 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7772 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7773 bool
7774 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7775 aarch64_addr_query_type type)
7776 {
7777 struct aarch64_address_info addr;
7778
7779 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7780 }
7781
7782 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7783
7784 static bool
7785 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7786 poly_int64 orig_offset,
7787 machine_mode mode)
7788 {
7789 HOST_WIDE_INT size;
7790 if (GET_MODE_SIZE (mode).is_constant (&size))
7791 {
7792 HOST_WIDE_INT const_offset, second_offset;
7793
7794 /* A general SVE offset is A * VQ + B. Remove the A component from
7795 coefficient 0 in order to get the constant B. */
7796 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7797
7798 /* Split an out-of-range address displacement into a base and
7799 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7800 range otherwise to increase opportunities for sharing the base
7801 address of different sizes. Unaligned accesses use the signed
7802 9-bit range, TImode/TFmode use the intersection of signed
7803 scaled 7-bit and signed 9-bit offset. */
7804 if (mode == TImode || mode == TFmode)
7805 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7806 else if ((const_offset & (size - 1)) != 0)
7807 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7808 else
7809 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7810
7811 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7812 return false;
7813
7814 /* Split the offset into second_offset and the rest. */
7815 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7816 *offset2 = gen_int_mode (second_offset, Pmode);
7817 return true;
7818 }
7819 else
7820 {
7821 /* Get the mode we should use as the basis of the range. For structure
7822 modes this is the mode of one vector. */
7823 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7824 machine_mode step_mode
7825 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7826
7827 /* Get the "mul vl" multiplier we'd like to use. */
7828 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7829 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7830 if (vec_flags & VEC_SVE_DATA)
7831 /* LDR supports a 9-bit range, but the move patterns for
7832 structure modes require all vectors to be in range of the
7833 same base. The simplest way of accommodating that while still
7834 promoting reuse of anchor points between different modes is
7835 to use an 8-bit range unconditionally. */
7836 vnum = ((vnum + 128) & 255) - 128;
7837 else
7838 /* Predicates are only handled singly, so we might as well use
7839 the full range. */
7840 vnum = ((vnum + 256) & 511) - 256;
7841 if (vnum == 0)
7842 return false;
7843
7844 /* Convert the "mul vl" multiplier into a byte offset. */
7845 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7846 if (known_eq (second_offset, orig_offset))
7847 return false;
7848
7849 /* Split the offset into second_offset and the rest. */
7850 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7851 *offset2 = gen_int_mode (second_offset, Pmode);
7852 return true;
7853 }
7854 }
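
/* Worked example (illustrative): for a DImode access at constant offset
   0x10008 the offset is aligned, so second_offset = 0x10008 & 0x3ffc = 0x8
   and the split produces

     *offset1 = 0x10000   (added to the base; a shareable anchor)
     *offset2 = 0x8       (folded into the load/store itself)  */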
7855
7856 /* Return the binary representation of floating point constant VALUE in INTVAL.
7857 If the value cannot be converted, return false without setting INTVAL.
7858 The conversion is done in the mode of VALUE. */
7859 bool
7860 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7861 {
7862
7863 /* We make a general exception for 0. */
7864 if (aarch64_float_const_zero_rtx_p (value))
7865 {
7866 *intval = 0;
7867 return true;
7868 }
7869
7870 scalar_float_mode mode;
7871 if (GET_CODE (value) != CONST_DOUBLE
7872 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7873 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7874 /* Only support up to DF mode. */
7875 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7876 return false;
7877
7878 unsigned HOST_WIDE_INT ival = 0;
7879
7880 long res[2];
7881 real_to_target (res,
7882 CONST_DOUBLE_REAL_VALUE (value),
7883 REAL_MODE_FORMAT (mode));
7884
7885 if (mode == DFmode)
7886 {
7887 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7888 ival = zext_hwi (res[order], 32);
7889 ival |= (zext_hwi (res[1 - order], 32) << 32);
7890 }
7891 else
7892 ival = zext_hwi (res[0], 32);
7893
7894 *intval = ival;
7895 return true;
7896 }
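
/* Illustrative examples (standard IEEE encodings): for the SFmode constant
   1.0 this stores 0x3f800000 in *INTVAL; for the DFmode constant 1.0 it
   stores 0x3ff0000000000000. */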
7897
7898 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7899 single MOV(+MOVK) followed by an FMOV. */
7900 bool
7901 aarch64_float_const_rtx_p (rtx x)
7902 {
7903 machine_mode mode = GET_MODE (x);
7904 if (mode == VOIDmode)
7905 return false;
7906
7907 /* Determine whether it's cheaper to write float constants as
7908 mov/movk pairs than as ldr/adrp pairs. */
7909 unsigned HOST_WIDE_INT ival;
7910
7911 if (GET_CODE (x) == CONST_DOUBLE
7912 && SCALAR_FLOAT_MODE_P (mode)
7913 && aarch64_reinterpret_float_as_int (x, &ival))
7914 {
7915 scalar_int_mode imode = (mode == HFmode
7916 ? SImode
7917 : int_mode_for_mode (mode).require ());
7918 int num_instr = aarch64_internal_mov_immediate
7919 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7920 return num_instr < 3;
7921 }
7922
7923 return false;
7924 }
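
/* For example (illustrative): the DFmode constant 4294967296.0 (bit pattern
   0x41f0000000000000) is out of range for an FMOV immediate, but its bit
   pattern is a single 16-bit chunk, so it can be built with one MOV plus an
   FMOV and the function returns true. */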
7925
7926 /* Return TRUE if rtx X is the immediate constant 0.0. */
7927 bool
7928 aarch64_float_const_zero_rtx_p (rtx x)
7929 {
7930 if (GET_MODE (x) == VOIDmode)
7931 return false;
7932
7933 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7934 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7935 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7936 }
7937
7938 /* Return TRUE if rtx X is an immediate constant that fits in a single
7939 MOVI immediate operation. */
7940 bool
7941 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7942 {
7943 if (!TARGET_SIMD)
7944 return false;
7945
7946 machine_mode vmode;
7947 scalar_int_mode imode;
7948 unsigned HOST_WIDE_INT ival;
7949
7950 if (GET_CODE (x) == CONST_DOUBLE
7951 && SCALAR_FLOAT_MODE_P (mode))
7952 {
7953 if (!aarch64_reinterpret_float_as_int (x, &ival))
7954 return false;
7955
7956 /* We make a general exception for 0. */
7957 if (aarch64_float_const_zero_rtx_p (x))
7958 return true;
7959
7960 imode = int_mode_for_mode (mode).require ();
7961 }
7962 else if (GET_CODE (x) == CONST_INT
7963 && is_a <scalar_int_mode> (mode, &imode))
7964 ival = INTVAL (x);
7965 else
7966 return false;
7967
7968 /* Use a 64-bit mode for everything except DI/DF mode, where we use
7969 a 128-bit vector mode. */
7970 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7971
7972 vmode = aarch64_simd_container_mode (imode, width);
7973 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7974
7975 return aarch64_simd_valid_immediate (v_op, NULL);
7976 }
7977
7978
7979 /* Return the fixed registers used for condition codes. */
7980
7981 static bool
7982 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7983 {
7984 *p1 = CC_REGNUM;
7985 *p2 = INVALID_REGNUM;
7986 return true;
7987 }
7988
7989 /* This function is used by the call expanders of the machine description.
7990 RESULT is the register in which the result is returned. It's NULL for
7991 "call" and "sibcall".
7992 MEM is the location of the function call.
7993 SIBCALL indicates whether this function call is a normal call or a sibling
7994 call; a different pattern is generated accordingly. */
7995
7996 void
7997 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7998 {
7999 rtx call, callee, tmp;
8000 rtvec vec;
8001 machine_mode mode;
8002
8003 gcc_assert (MEM_P (mem));
8004 callee = XEXP (mem, 0);
8005 mode = GET_MODE (callee);
8006 gcc_assert (mode == Pmode);
8007
8008 /* Decide if we should generate indirect calls by loading the
8009 address of the callee into a register before performing
8010 the branch-and-link. */
8011 if (SYMBOL_REF_P (callee)
8012 ? (aarch64_is_long_call_p (callee)
8013 || aarch64_is_noplt_call_p (callee))
8014 : !REG_P (callee))
8015 XEXP (mem, 0) = force_reg (mode, callee);
8016
8017 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8018
8019 if (result != NULL_RTX)
8020 call = gen_rtx_SET (result, call);
8021
8022 if (sibcall)
8023 tmp = ret_rtx;
8024 else
8025 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8026
8027 vec = gen_rtvec (2, call, tmp);
8028 call = gen_rtx_PARALLEL (VOIDmode, vec);
8029
8030 aarch64_emit_call_insn (call);
8031 }
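
/* Illustrative RTL (not from the original sources, assuming LP64): for a
   normal call returning a value in x0 the code above emits

     (parallel [(set (reg:DI x0)
                     (call (mem:DI (symbol_ref "foo")) (const_int 0)))
                (clobber (reg:DI x30))])

   whereas for a sibcall the CLOBBER of the link register is replaced by
   (return). */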
8032
8033 /* Emit call insn with PAT and do aarch64-specific handling. */
8034
8035 void
8036 aarch64_emit_call_insn (rtx pat)
8037 {
8038 rtx insn = emit_call_insn (pat);
8039
8040 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8041 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8042 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8043 }
8044
8045 machine_mode
8046 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8047 {
8048 machine_mode mode_x = GET_MODE (x);
8049 rtx_code code_x = GET_CODE (x);
8050
8051 /* All floating point compares return CCFP if it is an equality
8052 comparison, and CCFPE otherwise. */
8053 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8054 {
8055 switch (code)
8056 {
8057 case EQ:
8058 case NE:
8059 case UNORDERED:
8060 case ORDERED:
8061 case UNLT:
8062 case UNLE:
8063 case UNGT:
8064 case UNGE:
8065 case UNEQ:
8066 return CCFPmode;
8067
8068 case LT:
8069 case LE:
8070 case GT:
8071 case GE:
8072 case LTGT:
8073 return CCFPEmode;
8074
8075 default:
8076 gcc_unreachable ();
8077 }
8078 }
8079
8080 /* Equality comparisons of short modes against zero can be performed
8081 using the TST instruction with the appropriate bitmask. */
8082 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8083 && (code == EQ || code == NE)
8084 && (mode_x == HImode || mode_x == QImode))
8085 return CC_NZmode;
8086
8087 /* Similarly, comparisons of zero_extends from shorter modes can
8088 be performed using an ANDS with an immediate mask. */
8089 if (y == const0_rtx && code_x == ZERO_EXTEND
8090 && (mode_x == SImode || mode_x == DImode)
8091 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8092 && (code == EQ || code == NE))
8093 return CC_NZmode;
8094
8095 if ((mode_x == SImode || mode_x == DImode)
8096 && y == const0_rtx
8097 && (code == EQ || code == NE || code == LT || code == GE)
8098 && (code_x == PLUS || code_x == MINUS || code_x == AND
8099 || code_x == NEG
8100 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8101 && CONST_INT_P (XEXP (x, 2)))))
8102 return CC_NZmode;
8103
8104 /* A compare with a shifted operand. Because of canonicalization,
8105 the comparison will have to be swapped when we emit the assembly
8106 code. */
8107 if ((mode_x == SImode || mode_x == DImode)
8108 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8109 && (code_x == ASHIFT || code_x == ASHIFTRT
8110 || code_x == LSHIFTRT
8111 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8112 return CC_SWPmode;
8113
8114 /* Similarly for a negated operand, but we can only do this for
8115 equalities. */
8116 if ((mode_x == SImode || mode_x == DImode)
8117 && (REG_P (y) || GET_CODE (y) == SUBREG)
8118 && (code == EQ || code == NE)
8119 && code_x == NEG)
8120 return CC_Zmode;
8121
8122 /* A test for unsigned overflow from an addition. */
8123 if ((mode_x == DImode || mode_x == TImode)
8124 && (code == LTU || code == GEU)
8125 && code_x == PLUS
8126 && rtx_equal_p (XEXP (x, 0), y))
8127 return CC_Cmode;
8128
8129 /* A test for unsigned overflow from an add with carry. */
8130 if ((mode_x == DImode || mode_x == TImode)
8131 && (code == LTU || code == GEU)
8132 && code_x == PLUS
8133 && CONST_SCALAR_INT_P (y)
8134 && (rtx_mode_t (y, mode_x)
8135 == (wi::shwi (1, mode_x)
8136 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8137 return CC_ADCmode;
8138
8139 /* A test for signed overflow. */
8140 if ((mode_x == DImode || mode_x == TImode)
8141 && code == NE
8142 && code_x == PLUS
8143 && GET_CODE (y) == SIGN_EXTEND)
8144 return CC_Vmode;
8145
8146 /* For everything else, return CCmode. */
8147 return CCmode;
8148 }
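
/* Illustrative examples (not from the original sources):

     (compare (plus:DI a b) (const_int 0)), EQ/NE/LT/GE  -> CC_NZmode
       (an ADDS/SUBS sets the N and Z flags directly)
     (compare (ashift:DI a n) b)                         -> CC_SWPmode
       (emitted as "cmp b, a, lsl n", so the condition is swapped)
     floating-point equality/unordered comparisons       -> CCFPmode  */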
8149
8150 static int
8151 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8152
8153 int
8154 aarch64_get_condition_code (rtx x)
8155 {
8156 machine_mode mode = GET_MODE (XEXP (x, 0));
8157 enum rtx_code comp_code = GET_CODE (x);
8158
8159 if (GET_MODE_CLASS (mode) != MODE_CC)
8160 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8161 return aarch64_get_condition_code_1 (mode, comp_code);
8162 }
8163
8164 static int
8165 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8166 {
8167 switch (mode)
8168 {
8169 case E_CCFPmode:
8170 case E_CCFPEmode:
8171 switch (comp_code)
8172 {
8173 case GE: return AARCH64_GE;
8174 case GT: return AARCH64_GT;
8175 case LE: return AARCH64_LS;
8176 case LT: return AARCH64_MI;
8177 case NE: return AARCH64_NE;
8178 case EQ: return AARCH64_EQ;
8179 case ORDERED: return AARCH64_VC;
8180 case UNORDERED: return AARCH64_VS;
8181 case UNLT: return AARCH64_LT;
8182 case UNLE: return AARCH64_LE;
8183 case UNGT: return AARCH64_HI;
8184 case UNGE: return AARCH64_PL;
8185 default: return -1;
8186 }
8187 break;
8188
8189 case E_CCmode:
8190 switch (comp_code)
8191 {
8192 case NE: return AARCH64_NE;
8193 case EQ: return AARCH64_EQ;
8194 case GE: return AARCH64_GE;
8195 case GT: return AARCH64_GT;
8196 case LE: return AARCH64_LE;
8197 case LT: return AARCH64_LT;
8198 case GEU: return AARCH64_CS;
8199 case GTU: return AARCH64_HI;
8200 case LEU: return AARCH64_LS;
8201 case LTU: return AARCH64_CC;
8202 default: return -1;
8203 }
8204 break;
8205
8206 case E_CC_SWPmode:
8207 switch (comp_code)
8208 {
8209 case NE: return AARCH64_NE;
8210 case EQ: return AARCH64_EQ;
8211 case GE: return AARCH64_LE;
8212 case GT: return AARCH64_LT;
8213 case LE: return AARCH64_GE;
8214 case LT: return AARCH64_GT;
8215 case GEU: return AARCH64_LS;
8216 case GTU: return AARCH64_CC;
8217 case LEU: return AARCH64_CS;
8218 case LTU: return AARCH64_HI;
8219 default: return -1;
8220 }
8221 break;
8222
8223 case E_CC_NZCmode:
8224 switch (comp_code)
8225 {
8226 case NE: return AARCH64_NE; /* = any */
8227 case EQ: return AARCH64_EQ; /* = none */
8228 case GE: return AARCH64_PL; /* = nfrst */
8229 case LT: return AARCH64_MI; /* = first */
8230 case GEU: return AARCH64_CS; /* = nlast */
8231 case GTU: return AARCH64_HI; /* = pmore */
8232 case LEU: return AARCH64_LS; /* = plast */
8233 case LTU: return AARCH64_CC; /* = last */
8234 default: return -1;
8235 }
8236 break;
8237
8238 case E_CC_NZmode:
8239 switch (comp_code)
8240 {
8241 case NE: return AARCH64_NE;
8242 case EQ: return AARCH64_EQ;
8243 case GE: return AARCH64_PL;
8244 case LT: return AARCH64_MI;
8245 default: return -1;
8246 }
8247 break;
8248
8249 case E_CC_Zmode:
8250 switch (comp_code)
8251 {
8252 case NE: return AARCH64_NE;
8253 case EQ: return AARCH64_EQ;
8254 default: return -1;
8255 }
8256 break;
8257
8258 case E_CC_Cmode:
8259 switch (comp_code)
8260 {
8261 case LTU: return AARCH64_CS;
8262 case GEU: return AARCH64_CC;
8263 default: return -1;
8264 }
8265 break;
8266
8267 case E_CC_ADCmode:
8268 switch (comp_code)
8269 {
8270 case GEU: return AARCH64_CS;
8271 case LTU: return AARCH64_CC;
8272 default: return -1;
8273 }
8274 break;
8275
8276 case E_CC_Vmode:
8277 switch (comp_code)
8278 {
8279 case NE: return AARCH64_VS;
8280 case EQ: return AARCH64_VC;
8281 default: return -1;
8282 }
8283 break;
8284
8285 default:
8286 return -1;
8287 }
8288
8289 return -1;
8290 }
8291
8292 bool
8293 aarch64_const_vec_all_same_in_range_p (rtx x,
8294 HOST_WIDE_INT minval,
8295 HOST_WIDE_INT maxval)
8296 {
8297 rtx elt;
8298 return (const_vec_duplicate_p (x, &elt)
8299 && CONST_INT_P (elt)
8300 && IN_RANGE (INTVAL (elt), minval, maxval));
8301 }
8302
8303 bool
8304 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8305 {
8306 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8307 }
8308
8309 /* Return true if VEC is a constant in which every element is in the range
8310 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8311
8312 static bool
8313 aarch64_const_vec_all_in_range_p (rtx vec,
8314 HOST_WIDE_INT minval,
8315 HOST_WIDE_INT maxval)
8316 {
8317 if (GET_CODE (vec) != CONST_VECTOR
8318 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8319 return false;
8320
8321 int nunits;
8322 if (!CONST_VECTOR_STEPPED_P (vec))
8323 nunits = const_vector_encoded_nelts (vec);
8324 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8325 return false;
8326
8327 for (int i = 0; i < nunits; i++)
8328 {
8329 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8330 if (!CONST_INT_P (vec_elem)
8331 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8332 return false;
8333 }
8334 return true;
8335 }
8336
8337 /* N Z C V. */
8338 #define AARCH64_CC_V 1
8339 #define AARCH64_CC_C (1 << 1)
8340 #define AARCH64_CC_Z (1 << 2)
8341 #define AARCH64_CC_N (1 << 3)
8342
8343 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8344 static const int aarch64_nzcv_codes[] =
8345 {
8346 0, /* EQ, Z == 1. */
8347 AARCH64_CC_Z, /* NE, Z == 0. */
8348 0, /* CS, C == 1. */
8349 AARCH64_CC_C, /* CC, C == 0. */
8350 0, /* MI, N == 1. */
8351 AARCH64_CC_N, /* PL, N == 0. */
8352 0, /* VS, V == 1. */
8353 AARCH64_CC_V, /* VC, V == 0. */
8354 0, /* HI, C == 1 && Z == 0. */
8355 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8356 AARCH64_CC_V, /* GE, N == V. */
8357 0, /* LT, N != V. */
8358 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8359 0, /* LE, !(Z == 0 && N == V). */
8360 0, /* AL, Any. */
8361 0 /* NV, Any. */
8362 };
8363
8364 /* Print floating-point vector immediate operand X to F, negating it
8365 first if NEGATE is true. Return true on success, false if it isn't
8366 a constant we can handle. */
8367
8368 static bool
8369 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8370 {
8371 rtx elt;
8372
8373 if (!const_vec_duplicate_p (x, &elt))
8374 return false;
8375
8376 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8377 if (negate)
8378 r = real_value_negate (&r);
8379
8380 /* Handle the SVE single-bit immediates specially, since they have a
8381 fixed form in the assembly syntax. */
8382 if (real_equal (&r, &dconst0))
8383 asm_fprintf (f, "0.0");
8384 else if (real_equal (&r, &dconst2))
8385 asm_fprintf (f, "2.0");
8386 else if (real_equal (&r, &dconst1))
8387 asm_fprintf (f, "1.0");
8388 else if (real_equal (&r, &dconsthalf))
8389 asm_fprintf (f, "0.5");
8390 else
8391 {
8392 const int buf_size = 20;
8393 char float_buf[buf_size] = {'\0'};
8394 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8395 1, GET_MODE (elt));
8396 asm_fprintf (f, "%s", float_buf);
8397 }
8398
8399 return true;
8400 }
8401
8402 /* Return the equivalent letter for size. */
8403 static char
8404 sizetochar (int size)
8405 {
8406 switch (size)
8407 {
8408 case 64: return 'd';
8409 case 32: return 's';
8410 case 16: return 'h';
8411 case 8 : return 'b';
8412 default: gcc_unreachable ();
8413 }
8414 }
8415
8416 /* Print operand X to file F in a target specific manner according to CODE.
8417 The acceptable formatting commands given by CODE are:
8418 'c': An integer or symbol address without a preceding #
8419 sign.
8420 'C': Take the duplicated element in a vector constant
8421 and print it in hex.
8422 'D': Take the duplicated element in a vector constant
8423 and print it as an unsigned integer, in decimal.
8424 'e': Print the sign/zero-extend size as a character 8->b,
8425 16->h, 32->w. Can also be used for masks:
8426 0xff->b, 0xffff->h, 0xffffffff->w.
8427 'I': If the operand is a duplicated vector constant,
8428 replace it with the duplicated scalar. If the
8429 operand is then a floating-point constant, replace
8430 it with the integer bit representation. Print the
8431 transformed constant as a signed decimal number.
8432 'p': Prints N such that 2^N == X (X must be power of 2 and
8433 const int).
8434 'P': Print the number of non-zero bits in X (a const_int).
8435 'H': Print the higher numbered register of a pair (TImode)
8436 of regs.
8437 'm': Print a condition (eq, ne, etc).
8438 'M': Same as 'm', but invert condition.
8439 'N': Take the duplicated element in a vector constant
8440 and print the negative of it in decimal.
8441 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8442 'S/T/U/V': Print a FP/SIMD register name for a register list.
8443 The register printed is the FP/SIMD register name
8444 of X + 0/1/2/3 for S/T/U/V.
8445 'R': Print a scalar Integer/FP/SIMD register name + 1.
8446 'X': Print bottom 16 bits of integer constant in hex.
8447 'w/x': Print a general register name or the zero register
8448 (32-bit or 64-bit).
8449 '0': Print a normal operand, if it's a general register,
8450 then we assume DImode.
8451 'k': Print NZCV for conditional compare instructions.
8452 'A': Output address constant representing the first
8453 argument of X, specifying a relocation offset
8454 if appropriate.
8455 'L': Output constant address specified by X
8456 with a relocation offset if appropriate.
8457 'G': Prints address of X, specifying a PC relative
8458 relocation mode if appropriate.
8459 'y': Output address of LDP or STP - this is used for
8460 some LDP/STPs which don't use a PARALLEL in their
8461 pattern (so the mode needs to be adjusted).
8462 'z': Output address of a typical LDP or STP. */
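
/* For instance (illustrative), a pattern template such as
   "add\t%w0, %w1, %w2" prints the 32-bit register names (or wzr for the
   zero register), while "%x0" selects the 64-bit name and "%d0" the
   scalar FP/SIMD form. */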
8463
8464 static void
8465 aarch64_print_operand (FILE *f, rtx x, int code)
8466 {
8467 rtx elt;
8468 switch (code)
8469 {
8470 case 'c':
8471 switch (GET_CODE (x))
8472 {
8473 case CONST_INT:
8474 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8475 break;
8476
8477 case SYMBOL_REF:
8478 output_addr_const (f, x);
8479 break;
8480
8481 case CONST:
8482 if (GET_CODE (XEXP (x, 0)) == PLUS
8483 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8484 {
8485 output_addr_const (f, x);
8486 break;
8487 }
8488 /* Fall through. */
8489
8490 default:
8491 output_operand_lossage ("unsupported operand for code '%c'", code);
8492 }
8493 break;
8494
8495 case 'e':
8496 {
8497 x = unwrap_const_vec_duplicate (x);
8498 if (!CONST_INT_P (x))
8499 {
8500 output_operand_lossage ("invalid operand for '%%%c'", code);
8501 return;
8502 }
8503
8504 HOST_WIDE_INT val = INTVAL (x);
8505 if ((val & ~7) == 8 || val == 0xff)
8506 fputc ('b', f);
8507 else if ((val & ~7) == 16 || val == 0xffff)
8508 fputc ('h', f);
8509 else if ((val & ~7) == 32 || val == 0xffffffff)
8510 fputc ('w', f);
8511 else
8512 {
8513 output_operand_lossage ("invalid operand for '%%%c'", code);
8514 return;
8515 }
8516 }
8517 break;
8518
8519 case 'p':
8520 {
8521 int n;
8522
8523 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8524 {
8525 output_operand_lossage ("invalid operand for '%%%c'", code);
8526 return;
8527 }
8528
8529 asm_fprintf (f, "%d", n);
8530 }
8531 break;
8532
8533 case 'P':
8534 if (!CONST_INT_P (x))
8535 {
8536 output_operand_lossage ("invalid operand for '%%%c'", code);
8537 return;
8538 }
8539
8540 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8541 break;
8542
8543 case 'H':
8544 if (x == const0_rtx)
8545 {
8546 asm_fprintf (f, "xzr");
8547 break;
8548 }
8549
8550 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8551 {
8552 output_operand_lossage ("invalid operand for '%%%c'", code);
8553 return;
8554 }
8555
8556 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8557 break;
8558
8559 case 'I':
8560 {
8561 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8562 if (CONST_INT_P (x))
8563 asm_fprintf (f, "%wd", INTVAL (x));
8564 else
8565 {
8566 output_operand_lossage ("invalid operand for '%%%c'", code);
8567 return;
8568 }
8569 break;
8570 }
8571
8572 case 'M':
8573 case 'm':
8574 {
8575 int cond_code;
8576 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8577 if (x == const_true_rtx)
8578 {
8579 if (code == 'M')
8580 fputs ("nv", f);
8581 return;
8582 }
8583
8584 if (!COMPARISON_P (x))
8585 {
8586 output_operand_lossage ("invalid operand for '%%%c'", code);
8587 return;
8588 }
8589
8590 cond_code = aarch64_get_condition_code (x);
8591 gcc_assert (cond_code >= 0);
8592 if (code == 'M')
8593 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8594 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8595 fputs (aarch64_sve_condition_codes[cond_code], f);
8596 else
8597 fputs (aarch64_condition_codes[cond_code], f);
8598 }
8599 break;
8600
8601 case 'N':
8602 if (!const_vec_duplicate_p (x, &elt))
8603 {
8604 output_operand_lossage ("invalid vector constant");
8605 return;
8606 }
8607
8608 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8609 asm_fprintf (f, "%wd", -INTVAL (elt));
8610 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8611 && aarch64_print_vector_float_operand (f, x, true))
8612 ;
8613 else
8614 {
8615 output_operand_lossage ("invalid vector constant");
8616 return;
8617 }
8618 break;
8619
8620 case 'b':
8621 case 'h':
8622 case 's':
8623 case 'd':
8624 case 'q':
8625 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8626 {
8627 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8628 return;
8629 }
8630 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8631 break;
8632
8633 case 'S':
8634 case 'T':
8635 case 'U':
8636 case 'V':
8637 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8638 {
8639 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8640 return;
8641 }
8642 asm_fprintf (f, "%c%d",
8643 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8644 REGNO (x) - V0_REGNUM + (code - 'S'));
8645 break;
8646
8647 case 'R':
8648 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
8649 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8650 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8651 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
8652 else
8653 output_operand_lossage ("incompatible register operand for '%%%c'",
8654 code);
8655 break;
8656
8657 case 'X':
8658 if (!CONST_INT_P (x))
8659 {
8660 output_operand_lossage ("invalid operand for '%%%c'", code);
8661 return;
8662 }
8663 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8664 break;
8665
8666 case 'C':
8667 {
8668 /* Print a replicated constant in hex. */
8669 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8670 {
8671 output_operand_lossage ("invalid operand for '%%%c'", code);
8672 return;
8673 }
8674 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8675 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8676 }
8677 break;
8678
8679 case 'D':
8680 {
8681 /* Print a replicated constant in decimal, treating it as
8682 unsigned. */
8683 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8684 {
8685 output_operand_lossage ("invalid operand for '%%%c'", code);
8686 return;
8687 }
8688 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8689 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8690 }
8691 break;
8692
8693 case 'w':
8694 case 'x':
8695 if (x == const0_rtx
8696 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8697 {
8698 asm_fprintf (f, "%czr", code);
8699 break;
8700 }
8701
8702 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8703 {
8704 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8705 break;
8706 }
8707
8708 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8709 {
8710 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8711 break;
8712 }
8713
8714 /* Fall through */
8715
8716 case 0:
8717 if (x == NULL)
8718 {
8719 output_operand_lossage ("missing operand");
8720 return;
8721 }
8722
8723 switch (GET_CODE (x))
8724 {
8725 case REG:
8726 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8727 {
8728 if (REG_NREGS (x) == 1)
8729 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8730 else
8731 {
8732 char suffix
8733 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8734 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8735 REGNO (x) - V0_REGNUM, suffix,
8736 END_REGNO (x) - V0_REGNUM - 1, suffix);
8737 }
8738 }
8739 else
8740 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8741 break;
8742
8743 case MEM:
8744 output_address (GET_MODE (x), XEXP (x, 0));
8745 break;
8746
8747 case LABEL_REF:
8748 case SYMBOL_REF:
8749 output_addr_const (asm_out_file, x);
8750 break;
8751
8752 case CONST_INT:
8753 asm_fprintf (f, "%wd", INTVAL (x));
8754 break;
8755
8756 case CONST:
8757 if (!VECTOR_MODE_P (GET_MODE (x)))
8758 {
8759 output_addr_const (asm_out_file, x);
8760 break;
8761 }
8762 /* fall through */
8763
8764 case CONST_VECTOR:
8765 if (!const_vec_duplicate_p (x, &elt))
8766 {
8767 output_operand_lossage ("invalid vector constant");
8768 return;
8769 }
8770
8771 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8772 asm_fprintf (f, "%wd", INTVAL (elt));
8773 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8774 && aarch64_print_vector_float_operand (f, x, false))
8775 ;
8776 else
8777 {
8778 output_operand_lossage ("invalid vector constant");
8779 return;
8780 }
8781 break;
8782
8783 case CONST_DOUBLE:
8784 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8785 be getting CONST_DOUBLEs holding integers. */
8786 gcc_assert (GET_MODE (x) != VOIDmode);
8787 if (aarch64_float_const_zero_rtx_p (x))
8788 {
8789 fputc ('0', f);
8790 break;
8791 }
8792 else if (aarch64_float_const_representable_p (x))
8793 {
8794 #define buf_size 20
8795 char float_buf[buf_size] = {'\0'};
8796 real_to_decimal_for_mode (float_buf,
8797 CONST_DOUBLE_REAL_VALUE (x),
8798 buf_size, buf_size,
8799 1, GET_MODE (x));
8800 asm_fprintf (asm_out_file, "%s", float_buf);
8801 break;
8802 #undef buf_size
8803 }
8804 output_operand_lossage ("invalid constant");
8805 return;
8806 default:
8807 output_operand_lossage ("invalid operand");
8808 return;
8809 }
8810 break;
8811
8812 case 'A':
8813 if (GET_CODE (x) == HIGH)
8814 x = XEXP (x, 0);
8815
8816 switch (aarch64_classify_symbolic_expression (x))
8817 {
8818 case SYMBOL_SMALL_GOT_4G:
8819 asm_fprintf (asm_out_file, ":got:");
8820 break;
8821
8822 case SYMBOL_SMALL_TLSGD:
8823 asm_fprintf (asm_out_file, ":tlsgd:");
8824 break;
8825
8826 case SYMBOL_SMALL_TLSDESC:
8827 asm_fprintf (asm_out_file, ":tlsdesc:");
8828 break;
8829
8830 case SYMBOL_SMALL_TLSIE:
8831 asm_fprintf (asm_out_file, ":gottprel:");
8832 break;
8833
8834 case SYMBOL_TLSLE24:
8835 asm_fprintf (asm_out_file, ":tprel:");
8836 break;
8837
8838 case SYMBOL_TINY_GOT:
8839 gcc_unreachable ();
8840 break;
8841
8842 default:
8843 break;
8844 }
8845 output_addr_const (asm_out_file, x);
8846 break;
8847
8848 case 'L':
8849 switch (aarch64_classify_symbolic_expression (x))
8850 {
8851 case SYMBOL_SMALL_GOT_4G:
8852 asm_fprintf (asm_out_file, ":lo12:");
8853 break;
8854
8855 case SYMBOL_SMALL_TLSGD:
8856 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8857 break;
8858
8859 case SYMBOL_SMALL_TLSDESC:
8860 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8861 break;
8862
8863 case SYMBOL_SMALL_TLSIE:
8864 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8865 break;
8866
8867 case SYMBOL_TLSLE12:
8868 asm_fprintf (asm_out_file, ":tprel_lo12:");
8869 break;
8870
8871 case SYMBOL_TLSLE24:
8872 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8873 break;
8874
8875 case SYMBOL_TINY_GOT:
8876 asm_fprintf (asm_out_file, ":got:");
8877 break;
8878
8879 case SYMBOL_TINY_TLSIE:
8880 asm_fprintf (asm_out_file, ":gottprel:");
8881 break;
8882
8883 default:
8884 break;
8885 }
8886 output_addr_const (asm_out_file, x);
8887 break;
8888
8889 case 'G':
8890 switch (aarch64_classify_symbolic_expression (x))
8891 {
8892 case SYMBOL_TLSLE24:
8893 asm_fprintf (asm_out_file, ":tprel_hi12:");
8894 break;
8895 default:
8896 break;
8897 }
8898 output_addr_const (asm_out_file, x);
8899 break;
8900
8901 case 'k':
8902 {
8903 HOST_WIDE_INT cond_code;
8904
8905 if (!CONST_INT_P (x))
8906 {
8907 output_operand_lossage ("invalid operand for '%%%c'", code);
8908 return;
8909 }
8910
8911 cond_code = INTVAL (x);
8912 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8913 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8914 }
8915 break;
8916
8917 case 'y':
8918 case 'z':
8919 {
8920 machine_mode mode = GET_MODE (x);
8921
8922 if (GET_CODE (x) != MEM
8923 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8924 {
8925 output_operand_lossage ("invalid operand for '%%%c'", code);
8926 return;
8927 }
8928
8929 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8930 code == 'y'
8931 ? ADDR_QUERY_LDP_STP_N
8932 : ADDR_QUERY_LDP_STP))
8933 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8934 }
8935 break;
8936
8937 default:
8938 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8939 return;
8940 }
8941 }
8942
8943 /* Print address 'x' of a memory access with mode 'mode'.
8944 'type' is the context required by aarch64_classify_address: a normal
8945 memory access, or an LDP/STP query such as ADDR_QUERY_LDP_STP. */
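/* For example, a constant offset of 16 from x0 is printed as "[x0, 16]",
   a register offset x2 shifted left by 3 as "[x0, x2, lsl 3]", and a
   16-byte pre-increment writeback as "[x0, 16]!".  */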
8946 static bool
8947 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8948 aarch64_addr_query_type type)
8949 {
8950 struct aarch64_address_info addr;
8951 unsigned int size;
8952
8953 /* Check all addresses are Pmode - including ILP32. */
8954 if (GET_MODE (x) != Pmode
8955 && (!CONST_INT_P (x)
8956 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8957 {
8958 output_operand_lossage ("invalid address mode");
8959 return false;
8960 }
8961
8962 if (aarch64_classify_address (&addr, x, mode, true, type))
8963 switch (addr.type)
8964 {
8965 case ADDRESS_REG_IMM:
8966 if (known_eq (addr.const_offset, 0))
8967 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8968 else if (aarch64_sve_data_mode_p (mode))
8969 {
8970 HOST_WIDE_INT vnum
8971 = exact_div (addr.const_offset,
8972 BYTES_PER_SVE_VECTOR).to_constant ();
8973 asm_fprintf (f, "[%s, #%wd, mul vl]",
8974 reg_names[REGNO (addr.base)], vnum);
8975 }
8976 else if (aarch64_sve_pred_mode_p (mode))
8977 {
8978 HOST_WIDE_INT vnum
8979 = exact_div (addr.const_offset,
8980 BYTES_PER_SVE_PRED).to_constant ();
8981 asm_fprintf (f, "[%s, #%wd, mul vl]",
8982 reg_names[REGNO (addr.base)], vnum);
8983 }
8984 else
8985 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8986 INTVAL (addr.offset));
8987 return true;
8988
8989 case ADDRESS_REG_REG:
8990 if (addr.shift == 0)
8991 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8992 reg_names [REGNO (addr.offset)]);
8993 else
8994 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8995 reg_names [REGNO (addr.offset)], addr.shift);
8996 return true;
8997
8998 case ADDRESS_REG_UXTW:
8999 if (addr.shift == 0)
9000 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9001 REGNO (addr.offset) - R0_REGNUM);
9002 else
9003 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9004 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9005 return true;
9006
9007 case ADDRESS_REG_SXTW:
9008 if (addr.shift == 0)
9009 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9010 REGNO (addr.offset) - R0_REGNUM);
9011 else
9012 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9013 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9014 return true;
9015
9016 case ADDRESS_REG_WB:
9017 /* Writeback is only supported for fixed-width modes. */
9018 size = GET_MODE_SIZE (mode).to_constant ();
9019 switch (GET_CODE (x))
9020 {
9021 case PRE_INC:
9022 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9023 return true;
9024 case POST_INC:
9025 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9026 return true;
9027 case PRE_DEC:
9028 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9029 return true;
9030 case POST_DEC:
9031 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9032 return true;
9033 case PRE_MODIFY:
9034 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9035 INTVAL (addr.offset));
9036 return true;
9037 case POST_MODIFY:
9038 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9039 INTVAL (addr.offset));
9040 return true;
9041 default:
9042 break;
9043 }
9044 break;
9045
9046 case ADDRESS_LO_SUM:
9047 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9048 output_addr_const (f, addr.offset);
9049 asm_fprintf (f, "]");
9050 return true;
9051
9052 case ADDRESS_SYMBOLIC:
9053 output_addr_const (f, x);
9054 return true;
9055 }
9056
9057 return false;
9058 }
9059
9060 /* Print address 'x' of a memory access with mode 'mode'. */
9061 static void
9062 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9063 {
9064 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9065 output_addr_const (f, x);
9066 }
9067
9068 bool
9069 aarch64_label_mentioned_p (rtx x)
9070 {
9071 const char *fmt;
9072 int i;
9073
9074 if (GET_CODE (x) == LABEL_REF)
9075 return true;
9076
9077 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9078 referencing instruction, but they are constant offsets, not
9079 symbols. */
9080 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9081 return false;
9082
9083 fmt = GET_RTX_FORMAT (GET_CODE (x));
9084 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9085 {
9086 if (fmt[i] == 'E')
9087 {
9088 int j;
9089
9090 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9091 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9092 return 1;
9093 }
9094 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9095 return 1;
9096 }
9097
9098 return 0;
9099 }
9100
9101 /* Implement REGNO_REG_CLASS. */
9102
9103 enum reg_class
9104 aarch64_regno_regclass (unsigned regno)
9105 {
9106 if (GP_REGNUM_P (regno))
9107 return GENERAL_REGS;
9108
9109 if (regno == SP_REGNUM)
9110 return STACK_REG;
9111
9112 if (regno == FRAME_POINTER_REGNUM
9113 || regno == ARG_POINTER_REGNUM)
9114 return POINTER_REGS;
9115
9116 if (FP_REGNUM_P (regno))
9117 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9118 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9119
9120 if (PR_REGNUM_P (regno))
9121 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9122
9123 return NO_REGS;
9124 }
9125
9126 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9127 If OFFSET is out of range, return an offset of an anchor point
9128 that is in range. Return 0 otherwise. */
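/* For example, a 4-byte access at offset 0x12344 is given an anchor at
   0x10000; the residual offset 0x2344 then fits the scaled 12-bit
   LDR/STR immediate field.  */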
9129
9130 static HOST_WIDE_INT
9131 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9132 machine_mode mode)
9133 {
9134 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9135 if (size > 16)
9136 return (offset + 0x400) & ~0x7f0;
9137
9138 /* For offsets that aren't a multiple of the access size, the limit is
9139 -256...255. */
9140 if (offset & (size - 1))
9141 {
9142 /* BLKmode typically uses LDP of X-registers. */
9143 if (mode == BLKmode)
9144 return (offset + 512) & ~0x3ff;
9145 return (offset + 0x100) & ~0x1ff;
9146 }
9147
9148 /* Small negative offsets are supported. */
9149 if (IN_RANGE (offset, -256, 0))
9150 return 0;
9151
9152 if (mode == TImode || mode == TFmode)
9153 return (offset + 0x100) & ~0x1ff;
9154
9155 /* Otherwise use the 12-bit unsigned offset scaled by the access size. */
9156 return offset & (~0xfff * size);
9157 }
9158
9159 static rtx
9160 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9161 {
9162 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9163 where mask is selected by alignment and size of the offset.
9164 We try to pick as large a range for the offset as possible to
9165 maximize the chance of a CSE. However, for aligned addresses
9166 we limit the range to 4k so that structures with different sized
9167 elements are likely to use the same base. We need to be careful
9168 not to split a CONST for some forms of address expression, otherwise
9169 it will generate sub-optimal code. */
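/* For example, a DImode access at x0 + 0x12348 is split into
   y = x0 + 0x10000 followed by y + 0x2348, so that nearby accesses with
   different small offsets can share the anchor base y.  */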
9170
9171 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9172 {
9173 rtx base = XEXP (x, 0);
9174 rtx offset_rtx = XEXP (x, 1);
9175 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9176
9177 if (GET_CODE (base) == PLUS)
9178 {
9179 rtx op0 = XEXP (base, 0);
9180 rtx op1 = XEXP (base, 1);
9181
9182 /* Force any scaling into a temp for CSE. */
9183 op0 = force_reg (Pmode, op0);
9184 op1 = force_reg (Pmode, op1);
9185
9186 /* Let the pointer register be in op0. */
9187 if (REG_POINTER (op1))
9188 std::swap (op0, op1);
9189
9190 /* If the pointer is virtual or frame related, then we know that
9191 virtual register instantiation or register elimination is going
9192 to apply a second constant. We want the two constants folded
9193 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9194 if (virt_or_elim_regno_p (REGNO (op0)))
9195 {
9196 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9197 NULL_RTX, true, OPTAB_DIRECT);
9198 return gen_rtx_PLUS (Pmode, base, op1);
9199 }
9200
9201 /* Otherwise, in order to encourage CSE (and thence loop strength
9202 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9203 base = expand_binop (Pmode, add_optab, op0, op1,
9204 NULL_RTX, true, OPTAB_DIRECT);
9205 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9206 }
9207
9208 HOST_WIDE_INT size;
9209 if (GET_MODE_SIZE (mode).is_constant (&size))
9210 {
9211 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9212 mode);
9213 if (base_offset != 0)
9214 {
9215 base = plus_constant (Pmode, base, base_offset);
9216 base = force_operand (base, NULL_RTX);
9217 return plus_constant (Pmode, base, offset - base_offset);
9218 }
9219 }
9220 }
9221
9222 return x;
9223 }
9224
9225 static reg_class_t
9226 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9227 reg_class_t rclass,
9228 machine_mode mode,
9229 secondary_reload_info *sri)
9230 {
9231 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9232 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9233 comment at the head of aarch64-sve.md for more details about the
9234 big-endian handling. */
9235 if (BYTES_BIG_ENDIAN
9236 && reg_class_subset_p (rclass, FP_REGS)
9237 && !((REG_P (x) && HARD_REGISTER_P (x))
9238 || aarch64_simd_valid_immediate (x, NULL))
9239 && aarch64_sve_data_mode_p (mode))
9240 {
9241 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9242 return NO_REGS;
9243 }
9244
9245 /* If we have to disable direct literal pool loads and stores because the
9246 function is too big, then we need a scratch register. */
9247 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9248 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9249 || targetm.vector_mode_supported_p (GET_MODE (x)))
9250 && !aarch64_pcrelative_literal_loads)
9251 {
9252 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9253 return NO_REGS;
9254 }
9255
9256 /* Without the TARGET_SIMD instructions we cannot move a Q register
9257 to a Q register directly. We need a scratch. */
9258 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9259 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9260 && reg_class_subset_p (rclass, FP_REGS))
9261 {
9262 sri->icode = code_for_aarch64_reload_mov (mode);
9263 return NO_REGS;
9264 }
9265
9266 /* A TFmode or TImode memory access should be handled via an FP register
9267 (FP_REGS) because AArch64 has richer addressing modes for LDR/STR
9268 instructions than LDP/STP instructions. */
9269 if (TARGET_FLOAT && rclass == GENERAL_REGS
9270 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9271 return FP_REGS;
9272
9273 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9274 return GENERAL_REGS;
9275
9276 return NO_REGS;
9277 }
9278
9279 static bool
9280 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9281 {
9282 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9283
9284 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9285 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9286 if (frame_pointer_needed)
9287 return to == HARD_FRAME_POINTER_REGNUM;
9288 return true;
9289 }
9290
9291 poly_int64
9292 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9293 {
9294 if (to == HARD_FRAME_POINTER_REGNUM)
9295 {
9296 if (from == ARG_POINTER_REGNUM)
9297 return cfun->machine->frame.hard_fp_offset;
9298
9299 if (from == FRAME_POINTER_REGNUM)
9300 return cfun->machine->frame.hard_fp_offset
9301 - cfun->machine->frame.locals_offset;
9302 }
9303
9304 if (to == STACK_POINTER_REGNUM)
9305 {
9306 if (from == FRAME_POINTER_REGNUM)
9307 return cfun->machine->frame.frame_size
9308 - cfun->machine->frame.locals_offset;
9309 }
9310
9311 return cfun->machine->frame.frame_size;
9312 }
9313
9314 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9315 previous frame. */
9316
9317 rtx
9318 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9319 {
9320 if (count != 0)
9321 return const0_rtx;
9322 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9323 }
9324
9325
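/* For LP64 without BTI the template emitted below is roughly:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding
	.dword	0		// patched with the function address
	.dword	0		// patched with the static chain

   When BTI is enabled, a "hint 34" (BTI c) is emitted first and the
   padding word is dropped, so the literals stay at the same offsets.
   aarch64_trampoline_init below fills in the two trailing pointers.  */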
9326 static void
9327 aarch64_asm_trampoline_template (FILE *f)
9328 {
9329 int offset1 = 16;
9330 int offset2 = 20;
9331
9332 if (aarch64_bti_enabled ())
9333 {
9334 asm_fprintf (f, "\thint\t34 // bti c\n");
9335 offset1 -= 4;
9336 offset2 -= 4;
9337 }
9338
9339 if (TARGET_ILP32)
9340 {
9341 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9342 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9343 offset1);
9344 }
9345 else
9346 {
9347 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9348 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9349 offset2);
9350 }
9351 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9352
9353 /* The trampoline needs an extra padding instruction. If BTI is enabled,
9354 the padding instruction is replaced by the BTI instruction emitted at
9355 the beginning. */
9356 if (!aarch64_bti_enabled ())
9357 assemble_aligned_integer (4, const0_rtx);
9358
9359 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9360 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9361 }
9362
9363 static void
9364 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9365 {
9366 rtx fnaddr, mem, a_tramp;
9367 const int tramp_code_sz = 16;
9368
9369 /* We don't need to copy the trailing D-words; we fill those in below. */
9370 emit_block_move (m_tramp, assemble_trampoline_template (),
9371 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9372 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9373 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9374 if (GET_MODE (fnaddr) != ptr_mode)
9375 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9376 emit_move_insn (mem, fnaddr);
9377
9378 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9379 emit_move_insn (mem, chain_value);
9380
9381 /* XXX We should really define a "clear_cache" pattern and use
9382 gen_clear_cache(). */
9383 a_tramp = XEXP (m_tramp, 0);
9384 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9385 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9386 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9387 ptr_mode);
9388 }
9389
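/* Return the maximum number of registers of class REGCLASS needed to
   hold a value of mode MODE.  For example, V4SImode (16 bytes) fits in
   a single vector register, whereas a 16-byte non-vector mode such as
   TImode is counted as two word-sized registers.  */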
9390 static unsigned char
9391 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9392 {
9393 /* ??? Logically we should only need to provide a value when
9394 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9395 can hold MODE, but at the moment we need to handle all modes.
9396 Just ignore any runtime parts for registers that can't store them. */
9397 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9398 unsigned int nregs;
9399 switch (regclass)
9400 {
9401 case TAILCALL_ADDR_REGS:
9402 case POINTER_REGS:
9403 case GENERAL_REGS:
9404 case ALL_REGS:
9405 case POINTER_AND_FP_REGS:
9406 case FP_REGS:
9407 case FP_LO_REGS:
9408 case FP_LO8_REGS:
9409 if (aarch64_sve_data_mode_p (mode)
9410 && constant_multiple_p (GET_MODE_SIZE (mode),
9411 BYTES_PER_SVE_VECTOR, &nregs))
9412 return nregs;
9413 return (aarch64_vector_data_mode_p (mode)
9414 ? CEIL (lowest_size, UNITS_PER_VREG)
9415 : CEIL (lowest_size, UNITS_PER_WORD));
9416 case STACK_REG:
9417 case PR_REGS:
9418 case PR_LO_REGS:
9419 case PR_HI_REGS:
9420 return 1;
9421
9422 case NO_REGS:
9423 return 0;
9424
9425 default:
9426 break;
9427 }
9428 gcc_unreachable ();
9429 }
9430
9431 static reg_class_t
9432 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9433 {
9434 if (regclass == POINTER_REGS)
9435 return GENERAL_REGS;
9436
9437 if (regclass == STACK_REG)
9438 {
9439 if (REG_P(x)
9440 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9441 return regclass;
9442
9443 return NO_REGS;
9444 }
9445
9446 /* Register elimination can result in a request for
9447 SP+constant->FP_REGS. We cannot support such operations, which
9448 use SP as the source and an FP_REG as the destination, so reject
9449 them right now. */
9450 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9451 {
9452 rtx lhs = XEXP (x, 0);
9453
9454 /* Look through a possible SUBREG introduced by ILP32. */
9455 if (GET_CODE (lhs) == SUBREG)
9456 lhs = SUBREG_REG (lhs);
9457
9458 gcc_assert (REG_P (lhs));
9459 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9460 POINTER_REGS));
9461 return NO_REGS;
9462 }
9463
9464 return regclass;
9465 }
9466
9467 void
9468 aarch64_asm_output_labelref (FILE* f, const char *name)
9469 {
9470 asm_fprintf (f, "%U%s", name);
9471 }
9472
9473 static void
9474 aarch64_elf_asm_constructor (rtx symbol, int priority)
9475 {
9476 if (priority == DEFAULT_INIT_PRIORITY)
9477 default_ctor_section_asm_out_constructor (symbol, priority);
9478 else
9479 {
9480 section *s;
9481 /* While priority is known to be in the range [0, 65535] (so 18 bytes
9482 would be enough), the compiler might not know that. To avoid a
9483 -Wformat-truncation false positive, use a larger size. */
9484 char buf[23];
9485 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9486 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9487 switch_to_section (s);
9488 assemble_align (POINTER_SIZE);
9489 assemble_aligned_integer (POINTER_BYTES, symbol);
9490 }
9491 }
9492
9493 static void
9494 aarch64_elf_asm_destructor (rtx symbol, int priority)
9495 {
9496 if (priority == DEFAULT_INIT_PRIORITY)
9497 default_dtor_section_asm_out_destructor (symbol, priority);
9498 else
9499 {
9500 section *s;
9501 /* While priority is known to be in the range [0, 65535] (so 18 bytes
9502 would be enough), the compiler might not know that. To avoid a
9503 -Wformat-truncation false positive, use a larger size. */
9504 char buf[23];
9505 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9506 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9507 switch_to_section (s);
9508 assemble_align (POINTER_SIZE);
9509 assemble_aligned_integer (POINTER_BYTES, symbol);
9510 }
9511 }
9512
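/* For a HImode dispatch table, with the table base in x0, the index in
   w1, and operands 3 and 4 allocated to x3 and x4, the sequence emitted
   below is roughly:

	ldrh	w3, [x0, w1, uxtw #1]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the anchor label
	add	x3, x4, w3, sxth #2	// entry is a scaled offset from it
	br	x3
   .Lrtx<N>:  */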
9513 const char*
9514 aarch64_output_casesi (rtx *operands)
9515 {
9516 char buf[100];
9517 char label[100];
9518 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9519 int index;
9520 static const char *const patterns[4][2] =
9521 {
9522 {
9523 "ldrb\t%w3, [%0,%w1,uxtw]",
9524 "add\t%3, %4, %w3, sxtb #2"
9525 },
9526 {
9527 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9528 "add\t%3, %4, %w3, sxth #2"
9529 },
9530 {
9531 "ldr\t%w3, [%0,%w1,uxtw #2]",
9532 "add\t%3, %4, %w3, sxtw #2"
9533 },
9534 /* We assume that DImode is only generated when not optimizing and
9535 that we don't really need 64-bit address offsets. That would
9536 imply an object file with 8GB of code in a single function! */
9537 {
9538 "ldr\t%w3, [%0,%w1,uxtw #2]",
9539 "add\t%3, %4, %w3, sxtw #2"
9540 }
9541 };
9542
9543 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9544
9545 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9546 index = exact_log2 (GET_MODE_SIZE (mode));
9547
9548 gcc_assert (index >= 0 && index <= 3);
9549
9550 /* Need to implement table size reduction, by changing the code below. */
9551 output_asm_insn (patterns[index][0], operands);
9552 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9553 snprintf (buf, sizeof (buf),
9554 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9555 output_asm_insn (buf, operands);
9556 output_asm_insn (patterns[index][1], operands);
9557 output_asm_insn ("br\t%3", operands);
9558 assemble_label (asm_out_file, label);
9559 return "";
9560 }
9561
9562
9563 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9564 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9565 operator. */
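/* For example, a shift of 2 with mask 0x3fc (0xff << 2) describes a UXTB
   operand scaled by 4, so the function below returns 8; a mask that is
   not a shifted 0xff, 0xffff or 0xffffffff yields 0.  */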
9566
9567 int
9568 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9569 {
9570 if (shift >= 0 && shift <= 3)
9571 {
9572 int size;
9573 for (size = 8; size <= 32; size *= 2)
9574 {
9575 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9576 if (mask == bits << shift)
9577 return size;
9578 }
9579 }
9580 return 0;
9581 }
9582
9583 /* Constant pools are per-function only when PC-relative
9584 literal loads are enabled or we are in the large memory
9585 model. */
9586
9587 static inline bool
9588 aarch64_can_use_per_function_literal_pools_p (void)
9589 {
9590 return (aarch64_pcrelative_literal_loads
9591 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9592 }
9593
9594 static bool
9595 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9596 {
9597 /* We can't use blocks for constants when we're using a per-function
9598 constant pool. */
9599 return !aarch64_can_use_per_function_literal_pools_p ();
9600 }
9601
9602 /* Select appropriate section for constants depending
9603 on where we place literal pools. */
9604
9605 static section *
9606 aarch64_select_rtx_section (machine_mode mode,
9607 rtx x,
9608 unsigned HOST_WIDE_INT align)
9609 {
9610 if (aarch64_can_use_per_function_literal_pools_p ())
9611 return function_section (current_function_decl);
9612
9613 return default_elf_select_rtx_section (mode, x, align);
9614 }
9615
9616 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9617 void
9618 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9619 HOST_WIDE_INT offset)
9620 {
9621 /* When using per-function literal pools, we must ensure that any code
9622 section is aligned to the minimal instruction length, lest we get
9623 errors from the assembler re "unaligned instructions". */
9624 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9625 ASM_OUTPUT_ALIGN (f, 2);
9626 }
9627
9628 /* Costs. */
9629
9630 /* Helper function for rtx cost calculation. Strip a shift expression
9631 from X. Returns the inner operand if successful, or the original
9632 expression on failure. */
9633 static rtx
9634 aarch64_strip_shift (rtx x)
9635 {
9636 rtx op = x;
9637
9638 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9639 we can convert both to ROR during final output. */
9640 if ((GET_CODE (op) == ASHIFT
9641 || GET_CODE (op) == ASHIFTRT
9642 || GET_CODE (op) == LSHIFTRT
9643 || GET_CODE (op) == ROTATERT
9644 || GET_CODE (op) == ROTATE)
9645 && CONST_INT_P (XEXP (op, 1)))
9646 return XEXP (op, 0);
9647
9648 if (GET_CODE (op) == MULT
9649 && CONST_INT_P (XEXP (op, 1))
9650 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9651 return XEXP (op, 0);
9652
9653 return x;
9654 }
9655
9656 /* Helper function for rtx cost calculation. Strip an extend
9657 expression from X. Returns the inner operand if successful, or the
9658 original expression on failure. We deal with a number of possible
9659 canonicalization variations here. If STRIP_SHIFT is true, then
9660 we can strip off a shift also. */
9661 static rtx
9662 aarch64_strip_extend (rtx x, bool strip_shift)
9663 {
9664 scalar_int_mode mode;
9665 rtx op = x;
9666
9667 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9668 return op;
9669
9670 /* Zero and sign extraction of a widened value. */
9671 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9672 && XEXP (op, 2) == const0_rtx
9673 && GET_CODE (XEXP (op, 0)) == MULT
9674 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9675 XEXP (op, 1)))
9676 return XEXP (XEXP (op, 0), 0);
9677
9678 /* It can also be represented (for zero-extend) as an AND with an
9679 immediate. */
9680 if (GET_CODE (op) == AND
9681 && GET_CODE (XEXP (op, 0)) == MULT
9682 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9683 && CONST_INT_P (XEXP (op, 1))
9684 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9685 INTVAL (XEXP (op, 1))) != 0)
9686 return XEXP (XEXP (op, 0), 0);
9687
9688 /* Now handle extended register, as this may also have an optional
9689 left shift by 1..4. */
9690 if (strip_shift
9691 && GET_CODE (op) == ASHIFT
9692 && CONST_INT_P (XEXP (op, 1))
9693 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9694 op = XEXP (op, 0);
9695
9696 if (GET_CODE (op) == ZERO_EXTEND
9697 || GET_CODE (op) == SIGN_EXTEND)
9698 op = XEXP (op, 0);
9699
9700 if (op != x)
9701 return op;
9702
9703 return x;
9704 }
9705
9706 /* Return true iff CODE is a shift supported in combination
9707 with arithmetic instructions. */
9708
9709 static bool
9710 aarch64_shift_p (enum rtx_code code)
9711 {
9712 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9713 }
9714
9715
9716 /* Return true iff X is a cheap shift without a sign extend. */
9717
9718 static bool
9719 aarch64_cheap_mult_shift_p (rtx x)
9720 {
9721 rtx op0, op1;
9722
9723 op0 = XEXP (x, 0);
9724 op1 = XEXP (x, 1);
9725
9726 if (!(aarch64_tune_params.extra_tuning_flags
9727 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9728 return false;
9729
9730 if (GET_CODE (op0) == SIGN_EXTEND)
9731 return false;
9732
9733 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9734 && UINTVAL (op1) <= 4)
9735 return true;
9736
9737 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9738 return false;
9739
9740 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9741
9742 if (l2 > 0 && l2 <= 4)
9743 return true;
9744
9745 return false;
9746 }
9747
9748 /* Helper function for rtx cost calculation. Calculate the cost of
9749 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9750 Return the calculated cost of the expression, recursing manually in to
9751 operands where needed. */
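/* For example, (plus (mult X (const_int 8)) Y) is costed as a single
   arithmetic instruction with a shift-by-immediate operand, i.e. as
   "add x0, x1, x2, lsl 3", rather than as a separate multiply.  */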
9752
9753 static int
9754 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9755 {
9756 rtx op0, op1;
9757 const struct cpu_cost_table *extra_cost
9758 = aarch64_tune_params.insn_extra_cost;
9759 int cost = 0;
9760 bool compound_p = (outer == PLUS || outer == MINUS);
9761 machine_mode mode = GET_MODE (x);
9762
9763 gcc_checking_assert (code == MULT);
9764
9765 op0 = XEXP (x, 0);
9766 op1 = XEXP (x, 1);
9767
9768 if (VECTOR_MODE_P (mode))
9769 mode = GET_MODE_INNER (mode);
9770
9771 /* Integer multiply/fma. */
9772 if (GET_MODE_CLASS (mode) == MODE_INT)
9773 {
9774 /* The multiply will be canonicalized as a shift, cost it as such. */
9775 if (aarch64_shift_p (GET_CODE (x))
9776 || (CONST_INT_P (op1)
9777 && exact_log2 (INTVAL (op1)) > 0))
9778 {
9779 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9780 || GET_CODE (op0) == SIGN_EXTEND;
9781 if (speed)
9782 {
9783 if (compound_p)
9784 {
9785 /* If the shift is considered cheap,
9786 then don't add any cost. */
9787 if (aarch64_cheap_mult_shift_p (x))
9788 ;
9789 else if (REG_P (op1))
9790 /* ARITH + shift-by-register. */
9791 cost += extra_cost->alu.arith_shift_reg;
9792 else if (is_extend)
9793 /* ARITH + extended register. We don't have a cost field
9794 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9795 cost += extra_cost->alu.extend_arith;
9796 else
9797 /* ARITH + shift-by-immediate. */
9798 cost += extra_cost->alu.arith_shift;
9799 }
9800 else
9801 /* LSL (immediate). */
9802 cost += extra_cost->alu.shift;
9803
9804 }
9805 /* Strip extends as we will have costed them in the case above. */
9806 if (is_extend)
9807 op0 = aarch64_strip_extend (op0, true);
9808
9809 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9810
9811 return cost;
9812 }
9813
9814 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9815 compound and let the below cases handle it. After all, MNEG is a
9816 special-case alias of MSUB. */
9817 if (GET_CODE (op0) == NEG)
9818 {
9819 op0 = XEXP (op0, 0);
9820 compound_p = true;
9821 }
9822
9823 /* Integer multiplies or FMAs have zero/sign extending variants. */
9824 if ((GET_CODE (op0) == ZERO_EXTEND
9825 && GET_CODE (op1) == ZERO_EXTEND)
9826 || (GET_CODE (op0) == SIGN_EXTEND
9827 && GET_CODE (op1) == SIGN_EXTEND))
9828 {
9829 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9830 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9831
9832 if (speed)
9833 {
9834 if (compound_p)
9835 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9836 cost += extra_cost->mult[0].extend_add;
9837 else
9838 /* MUL/SMULL/UMULL. */
9839 cost += extra_cost->mult[0].extend;
9840 }
9841
9842 return cost;
9843 }
9844
9845 /* This is either an integer multiply or a MADD. In both cases
9846 we want to recurse and cost the operands. */
9847 cost += rtx_cost (op0, mode, MULT, 0, speed);
9848 cost += rtx_cost (op1, mode, MULT, 1, speed);
9849
9850 if (speed)
9851 {
9852 if (compound_p)
9853 /* MADD/MSUB. */
9854 cost += extra_cost->mult[mode == DImode].add;
9855 else
9856 /* MUL. */
9857 cost += extra_cost->mult[mode == DImode].simple;
9858 }
9859
9860 return cost;
9861 }
9862 else
9863 {
9864 if (speed)
9865 {
9866 /* Floating-point FMA/FMUL can also support negations of the
9867 operands, unless the rounding mode is upward or downward in
9868 which case FNMUL is different from FMUL with operand negation. */
9869 bool neg0 = GET_CODE (op0) == NEG;
9870 bool neg1 = GET_CODE (op1) == NEG;
9871 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9872 {
9873 if (neg0)
9874 op0 = XEXP (op0, 0);
9875 if (neg1)
9876 op1 = XEXP (op1, 0);
9877 }
9878
9879 if (compound_p)
9880 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9881 cost += extra_cost->fp[mode == DFmode].fma;
9882 else
9883 /* FMUL/FNMUL. */
9884 cost += extra_cost->fp[mode == DFmode].mult;
9885 }
9886
9887 cost += rtx_cost (op0, mode, MULT, 0, speed);
9888 cost += rtx_cost (op1, mode, MULT, 1, speed);
9889 return cost;
9890 }
9891 }
9892
9893 static int
9894 aarch64_address_cost (rtx x,
9895 machine_mode mode,
9896 addr_space_t as ATTRIBUTE_UNUSED,
9897 bool speed)
9898 {
9899 enum rtx_code c = GET_CODE (x);
9900 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9901 struct aarch64_address_info info;
9902 int cost = 0;
9903 info.shift = 0;
9904
9905 if (!aarch64_classify_address (&info, x, mode, false))
9906 {
9907 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9908 {
9909 /* This is a CONST or SYMBOL ref which will be split
9910 in a different way depending on the code model in use.
9911 Cost it through the generic infrastructure. */
9912 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9913 /* Divide through by the cost of one instruction to
9914 bring it to the same units as the address costs. */
9915 cost_symbol_ref /= COSTS_N_INSNS (1);
9916 /* The cost is then the cost of preparing the address,
9917 followed by an immediate (possibly 0) offset. */
9918 return cost_symbol_ref + addr_cost->imm_offset;
9919 }
9920 else
9921 {
9922 /* This is most likely a jump table from a case
9923 statement. */
9924 return addr_cost->register_offset;
9925 }
9926 }
9927
9928 switch (info.type)
9929 {
9930 case ADDRESS_LO_SUM:
9931 case ADDRESS_SYMBOLIC:
9932 case ADDRESS_REG_IMM:
9933 cost += addr_cost->imm_offset;
9934 break;
9935
9936 case ADDRESS_REG_WB:
9937 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9938 cost += addr_cost->pre_modify;
9939 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9940 cost += addr_cost->post_modify;
9941 else
9942 gcc_unreachable ();
9943
9944 break;
9945
9946 case ADDRESS_REG_REG:
9947 cost += addr_cost->register_offset;
9948 break;
9949
9950 case ADDRESS_REG_SXTW:
9951 cost += addr_cost->register_sextend;
9952 break;
9953
9954 case ADDRESS_REG_UXTW:
9955 cost += addr_cost->register_zextend;
9956 break;
9957
9958 default:
9959 gcc_unreachable ();
9960 }
9961
9962
9963 if (info.shift > 0)
9964 {
9965 /* For the sake of calculating the cost of the shifted register
9966 component, we can treat same sized modes in the same way. */
9967 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9968 cost += addr_cost->addr_scale_costs.hi;
9969 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9970 cost += addr_cost->addr_scale_costs.si;
9971 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9972 cost += addr_cost->addr_scale_costs.di;
9973 else
9974 /* We can't tell, or this is a 128-bit vector. */
9975 cost += addr_cost->addr_scale_costs.ti;
9976 }
9977
9978 return cost;
9979 }
9980
9981 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9982 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9983 to be taken. */
9984
9985 int
9986 aarch64_branch_cost (bool speed_p, bool predictable_p)
9987 {
9988 /* When optimizing for speed, use the cost of unpredictable branches. */
9989 const struct cpu_branch_cost *branch_costs =
9990 aarch64_tune_params.branch_costs;
9991
9992 if (!speed_p || predictable_p)
9993 return branch_costs->predictable;
9994 else
9995 return branch_costs->unpredictable;
9996 }
9997
9998 /* Return true if the RTX X in mode MODE is a zero or sign extract
9999 usable in an ADD or SUB (extended register) instruction. */
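/* For example, (sign_extend:DI (reg:SI 1)) is such an operand; it maps
   onto the sign-extending register form "add x0, x2, w1, sxtw".  */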
10000 static bool
10001 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10002 {
10003 /* Catch add with a sign extract.
10004 This is add_<optab><mode>_multp2. */
10005 if (GET_CODE (x) == SIGN_EXTRACT
10006 || GET_CODE (x) == ZERO_EXTRACT)
10007 {
10008 rtx op0 = XEXP (x, 0);
10009 rtx op1 = XEXP (x, 1);
10010 rtx op2 = XEXP (x, 2);
10011
10012 if (GET_CODE (op0) == MULT
10013 && CONST_INT_P (op1)
10014 && op2 == const0_rtx
10015 && CONST_INT_P (XEXP (op0, 1))
10016 && aarch64_is_extend_from_extract (mode,
10017 XEXP (op0, 1),
10018 op1))
10019 {
10020 return true;
10021 }
10022 }
10023 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10024 No shift. */
10025 else if (GET_CODE (x) == SIGN_EXTEND
10026 || GET_CODE (x) == ZERO_EXTEND)
10027 return REG_P (XEXP (x, 0));
10028
10029 return false;
10030 }
10031
10032 static bool
10033 aarch64_frint_unspec_p (unsigned int u)
10034 {
10035 switch (u)
10036 {
10037 case UNSPEC_FRINTZ:
10038 case UNSPEC_FRINTP:
10039 case UNSPEC_FRINTM:
10040 case UNSPEC_FRINTA:
10041 case UNSPEC_FRINTN:
10042 case UNSPEC_FRINTX:
10043 case UNSPEC_FRINTI:
10044 return true;
10045
10046 default:
10047 return false;
10048 }
10049 }
10050
10051 /* Return true iff X is an rtx that will match an extr instruction
10052 i.e. as described in the *extr<mode>5_insn family of patterns.
10053 OP0 and OP1 will be set to the operands of the shifts involved
10054 on success and will be NULL_RTX otherwise. */
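/* For example, in DImode (ior (ashift X 16) (lshiftrt Y 48)) matches,
   since the two shift amounts sum to 64; it can be emitted as a single
   EXTR with an extract amount of 48.  */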
10055
10056 static bool
10057 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10058 {
10059 rtx op0, op1;
10060 scalar_int_mode mode;
10061 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10062 return false;
10063
10064 *res_op0 = NULL_RTX;
10065 *res_op1 = NULL_RTX;
10066
10067 if (GET_CODE (x) != IOR)
10068 return false;
10069
10070 op0 = XEXP (x, 0);
10071 op1 = XEXP (x, 1);
10072
10073 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10074 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10075 {
10076 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10077 if (GET_CODE (op1) == ASHIFT)
10078 std::swap (op0, op1);
10079
10080 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10081 return false;
10082
10083 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10084 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10085
10086 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10087 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10088 {
10089 *res_op0 = XEXP (op0, 0);
10090 *res_op1 = XEXP (op1, 0);
10091 return true;
10092 }
10093 }
10094
10095 return false;
10096 }
10097
10098 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10099 storing it in *COST. Result is true if the total cost of the operation
10100 has now been calculated. */
10101 static bool
10102 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10103 {
10104 rtx inner;
10105 rtx comparator;
10106 enum rtx_code cmpcode;
10107
10108 if (COMPARISON_P (op0))
10109 {
10110 inner = XEXP (op0, 0);
10111 comparator = XEXP (op0, 1);
10112 cmpcode = GET_CODE (op0);
10113 }
10114 else
10115 {
10116 inner = op0;
10117 comparator = const0_rtx;
10118 cmpcode = NE;
10119 }
10120
10121 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10122 {
10123 /* Conditional branch. */
10124 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10125 return true;
10126 else
10127 {
10128 if (cmpcode == NE || cmpcode == EQ)
10129 {
10130 if (comparator == const0_rtx)
10131 {
10132 /* TBZ/TBNZ/CBZ/CBNZ. */
10133 if (GET_CODE (inner) == ZERO_EXTRACT)
10134 /* TBZ/TBNZ. */
10135 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10136 ZERO_EXTRACT, 0, speed);
10137 else
10138 /* CBZ/CBNZ. */
10139 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10140
10141 return true;
10142 }
10143 }
10144 else if (cmpcode == LT || cmpcode == GE)
10145 {
10146 /* TBZ/TBNZ. */
10147 if (comparator == const0_rtx)
10148 return true;
10149 }
10150 }
10151 }
10152 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10153 {
10154 /* CCMP. */
10155 if (GET_CODE (op1) == COMPARE)
10156 {
10157 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10158 if (XEXP (op1, 1) == const0_rtx)
10159 *cost += 1;
10160 if (speed)
10161 {
10162 machine_mode mode = GET_MODE (XEXP (op1, 0));
10163 const struct cpu_cost_table *extra_cost
10164 = aarch64_tune_params.insn_extra_cost;
10165
10166 if (GET_MODE_CLASS (mode) == MODE_INT)
10167 *cost += extra_cost->alu.arith;
10168 else
10169 *cost += extra_cost->fp[mode == DFmode].compare;
10170 }
10171 return true;
10172 }
10173
10174 /* It's a conditional operation based on the status flags,
10175 so it must be some flavor of CSEL. */
10176
10177 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10178 if (GET_CODE (op1) == NEG
10179 || GET_CODE (op1) == NOT
10180 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10181 op1 = XEXP (op1, 0);
10182 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10183 {
10184 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10185 op1 = XEXP (op1, 0);
10186 op2 = XEXP (op2, 0);
10187 }
10188
10189 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10190 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10191 return true;
10192 }
10193
10194 /* We don't know what this is, cost all operands. */
10195 return false;
10196 }
10197
10198 /* Check whether X is a bitfield operation of the form shift + extend that
10199 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10200 operand to which the bitfield operation is applied. Otherwise return
10201 NULL_RTX. */
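/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI 1) (const_int 3)))
   is such a pattern: the shift and extension together form a single
   unsigned bitfield extract of register 1.  */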
10202
10203 static rtx
10204 aarch64_extend_bitfield_pattern_p (rtx x)
10205 {
10206 rtx_code outer_code = GET_CODE (x);
10207 machine_mode outer_mode = GET_MODE (x);
10208
10209 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10210 && outer_mode != SImode && outer_mode != DImode)
10211 return NULL_RTX;
10212
10213 rtx inner = XEXP (x, 0);
10214 rtx_code inner_code = GET_CODE (inner);
10215 machine_mode inner_mode = GET_MODE (inner);
10216 rtx op = NULL_RTX;
10217
10218 switch (inner_code)
10219 {
10220 case ASHIFT:
10221 if (CONST_INT_P (XEXP (inner, 1))
10222 && (inner_mode == QImode || inner_mode == HImode))
10223 op = XEXP (inner, 0);
10224 break;
10225 case LSHIFTRT:
10226 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10227 && (inner_mode == QImode || inner_mode == HImode))
10228 op = XEXP (inner, 0);
10229 break;
10230 case ASHIFTRT:
10231 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10232 && (inner_mode == QImode || inner_mode == HImode))
10233 op = XEXP (inner, 0);
10234 break;
10235 default:
10236 break;
10237 }
10238
10239 return op;
10240 }
10241
10242 /* Return true if the mask and a shift amount from an RTX of the form
10243 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10244 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
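/* For example, in SImode a shift amount of 8 with mask 0x00ffff00 is
   accepted: the mask shifted right by 8 is the contiguous value 0xffff,
   and no mask bits are set below the shift amount.  */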
10245
10246 bool
10247 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10248 rtx shft_amnt)
10249 {
10250 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10251 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10252 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10253 && (INTVAL (mask)
10254 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10255 }
10256
10257 /* Return true if the masks and a shift amount from an RTX of the form
10258 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10259 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
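/* For example, with a shift amount of 8 and mask2 = 0xff00, mask1 must
   be the bitwise complement ~0xff00; mask2 + (1 << 8) = 0x10000 is a
   power of two, so the inserted field is contiguous and starts at
   bit 8.  */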
10260
10261 bool
10262 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10263 unsigned HOST_WIDE_INT mask1,
10264 unsigned HOST_WIDE_INT shft_amnt,
10265 unsigned HOST_WIDE_INT mask2)
10266 {
10267 unsigned HOST_WIDE_INT t;
10268
10269 /* Verify that there is no overlap in what bits are set in the two masks. */
10270 if (mask1 != ~mask2)
10271 return false;
10272
10273 /* Verify that mask2 is not all zeros or ones. */
10274 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10275 return false;
10276
10277 /* The shift amount should always be less than the mode size. */
10278 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10279
10280 /* Verify that the mask being shifted is contiguous and would be in the
10281 least significant bits after shifting by shft_amnt. */
10282 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10283 return (t == (t & -t));
10284 }
10285
10286 /* Calculate the cost of calculating X, storing it in *COST. Result
10287 is true if the total cost of the operation has now been calculated. */
10288 static bool
10289 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10290 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10291 {
10292 rtx op0, op1, op2;
10293 const struct cpu_cost_table *extra_cost
10294 = aarch64_tune_params.insn_extra_cost;
10295 int code = GET_CODE (x);
10296 scalar_int_mode int_mode;
10297
10298 /* By default, assume that everything has equivalent cost to the
10299 cheapest instruction. Any additional costs are applied as a delta
10300 above this default. */
10301 *cost = COSTS_N_INSNS (1);
10302
10303 switch (code)
10304 {
10305 case SET:
10306 /* The cost depends entirely on the operands to SET. */
10307 *cost = 0;
10308 op0 = SET_DEST (x);
10309 op1 = SET_SRC (x);
10310
10311 switch (GET_CODE (op0))
10312 {
10313 case MEM:
10314 if (speed)
10315 {
10316 rtx address = XEXP (op0, 0);
10317 if (VECTOR_MODE_P (mode))
10318 *cost += extra_cost->ldst.storev;
10319 else if (GET_MODE_CLASS (mode) == MODE_INT)
10320 *cost += extra_cost->ldst.store;
10321 else if (mode == SFmode)
10322 *cost += extra_cost->ldst.storef;
10323 else if (mode == DFmode)
10324 *cost += extra_cost->ldst.stored;
10325
10326 *cost +=
10327 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10328 0, speed));
10329 }
10330
10331 *cost += rtx_cost (op1, mode, SET, 1, speed);
10332 return true;
10333
10334 case SUBREG:
10335 if (! REG_P (SUBREG_REG (op0)))
10336 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10337
10338 /* Fall through. */
10339 case REG:
10340 /* The cost is one per vector-register copied. */
10341 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10342 {
10343 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10344 *cost = COSTS_N_INSNS (nregs);
10345 }
10346 /* const0_rtx is in general free, but we will use an
10347 instruction to set a register to 0. */
10348 else if (REG_P (op1) || op1 == const0_rtx)
10349 {
10350 /* The cost is 1 per register copied. */
10351 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10352 *cost = COSTS_N_INSNS (nregs);
10353 }
10354 else
10355 /* Cost is just the cost of the RHS of the set. */
10356 *cost += rtx_cost (op1, mode, SET, 1, speed);
10357 return true;
10358
10359 case ZERO_EXTRACT:
10360 case SIGN_EXTRACT:
10361 /* Bit-field insertion. Strip any redundant widening of
10362 the RHS to meet the width of the target. */
10363 if (GET_CODE (op1) == SUBREG)
10364 op1 = SUBREG_REG (op1);
10365 if ((GET_CODE (op1) == ZERO_EXTEND
10366 || GET_CODE (op1) == SIGN_EXTEND)
10367 && CONST_INT_P (XEXP (op0, 1))
10368 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10369 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10370 op1 = XEXP (op1, 0);
10371
10372 if (CONST_INT_P (op1))
10373 {
10374 /* MOV immediate is assumed to always be cheap. */
10375 *cost = COSTS_N_INSNS (1);
10376 }
10377 else
10378 {
10379 /* BFM. */
10380 if (speed)
10381 *cost += extra_cost->alu.bfi;
10382 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10383 }
10384
10385 return true;
10386
10387 default:
10388 /* We can't make sense of this, assume default cost. */
10389 *cost = COSTS_N_INSNS (1);
10390 return false;
10391 }
10392 return false;
10393
10394 case CONST_INT:
10395 /* If an instruction can incorporate a constant within the
10396 instruction, the instruction's expression avoids calling
10397 rtx_cost() on the constant. If rtx_cost() is called on a
10398 constant, then it is usually because the constant must be
10399 moved into a register by one or more instructions.
10400
10401 The exception is constant 0, which can be expressed
10402 as XZR/WZR and is therefore free. The exception to this is
10403 if we have (set (reg) (const0_rtx)) in which case we must cost
10404 the move. However, we can catch that when we cost the SET, so
10405 we don't need to consider that here. */
10406 if (x == const0_rtx)
10407 *cost = 0;
10408 else
10409 {
10410 /* To an approximation, building any other constant is
10411 proportionally expensive to the number of instructions
10412 required to build that constant. This is true whether we
10413 are compiling for SPEED or otherwise. */
10414 if (!is_a <scalar_int_mode> (mode, &int_mode))
10415 int_mode = word_mode;
10416 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10417 (NULL_RTX, x, false, int_mode));
10418 }
10419 return true;
10420
10421 case CONST_DOUBLE:
10422
10423 /* First determine number of instructions to do the move
10424 as an integer constant. */
10425 if (!aarch64_float_const_representable_p (x)
10426 && !aarch64_can_const_movi_rtx_p (x, mode)
10427 && aarch64_float_const_rtx_p (x))
10428 {
10429 unsigned HOST_WIDE_INT ival;
10430 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10431 gcc_assert (succeed);
10432
10433 scalar_int_mode imode = (mode == HFmode
10434 ? SImode
10435 : int_mode_for_mode (mode).require ());
10436 int ncost = aarch64_internal_mov_immediate
10437 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10438 *cost += COSTS_N_INSNS (ncost);
10439 return true;
10440 }
10441
10442 if (speed)
10443 {
10444 /* mov[df,sf]_aarch64. */
10445 if (aarch64_float_const_representable_p (x))
10446 /* FMOV (scalar immediate). */
10447 *cost += extra_cost->fp[mode == DFmode].fpconst;
10448 else if (!aarch64_float_const_zero_rtx_p (x))
10449 {
10450 /* This will be a load from memory. */
10451 if (mode == DFmode)
10452 *cost += extra_cost->ldst.loadd;
10453 else
10454 *cost += extra_cost->ldst.loadf;
10455 }
10456 else
10457 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10458 or MOV v0.s[0], wzr - neither of which are modeled by the
10459 cost tables. Just use the default cost. */
10460 {
10461 }
10462 }
10463
10464 return true;
10465
10466 case MEM:
10467 if (speed)
10468 {
10469 /* For loads we want the base cost of a load, plus an
10470 approximation for the additional cost of the addressing
10471 mode. */
10472 rtx address = XEXP (x, 0);
10473 if (VECTOR_MODE_P (mode))
10474 *cost += extra_cost->ldst.loadv;
10475 else if (GET_MODE_CLASS (mode) == MODE_INT)
10476 *cost += extra_cost->ldst.load;
10477 else if (mode == SFmode)
10478 *cost += extra_cost->ldst.loadf;
10479 else if (mode == DFmode)
10480 *cost += extra_cost->ldst.loadd;
10481
10482 *cost +=
10483 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10484 0, speed));
10485 }
10486
10487 return true;
10488
10489 case NEG:
10490 op0 = XEXP (x, 0);
10491
10492 if (VECTOR_MODE_P (mode))
10493 {
10494 if (speed)
10495 {
10496 /* FNEG. */
10497 *cost += extra_cost->vect.alu;
10498 }
10499 return false;
10500 }
10501
10502 if (GET_MODE_CLASS (mode) == MODE_INT)
10503 {
10504 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10505 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10506 {
10507 /* CSETM. */
10508 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10509 return true;
10510 }
10511
10512 /* Cost this as SUB wzr, X. */
10513 op0 = CONST0_RTX (mode);
10514 op1 = XEXP (x, 0);
10515 goto cost_minus;
10516 }
10517
10518 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10519 {
10520 /* Support (neg(fma...)) as a single instruction only if
10521 sign of zeros is unimportant. This matches the decision
10522 making in aarch64.md. */
10523 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10524 {
10525 /* FNMADD. */
10526 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10527 return true;
10528 }
10529 if (GET_CODE (op0) == MULT)
10530 {
10531 /* FNMUL. */
10532 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10533 return true;
10534 }
10535 if (speed)
10536 /* FNEG. */
10537 *cost += extra_cost->fp[mode == DFmode].neg;
10538 return false;
10539 }
10540
10541 return false;
10542
10543 case CLRSB:
10544 case CLZ:
10545 if (speed)
10546 {
10547 if (VECTOR_MODE_P (mode))
10548 *cost += extra_cost->vect.alu;
10549 else
10550 *cost += extra_cost->alu.clz;
10551 }
10552
10553 return false;
10554
10555 case COMPARE:
10556 op0 = XEXP (x, 0);
10557 op1 = XEXP (x, 1);
10558
10559 if (op1 == const0_rtx
10560 && GET_CODE (op0) == AND)
10561 {
10562 x = op0;
10563 mode = GET_MODE (op0);
10564 goto cost_logic;
10565 }
10566
10567 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10568 {
10569 /* TODO: A write to the CC flags possibly costs extra; this
10570 needs encoding in the cost tables. */
10571
10572 mode = GET_MODE (op0);
10573 /* ANDS. */
10574 if (GET_CODE (op0) == AND)
10575 {
10576 x = op0;
10577 goto cost_logic;
10578 }
10579
10580 if (GET_CODE (op0) == PLUS)
10581 {
10582 /* ADDS (and CMN alias). */
10583 x = op0;
10584 goto cost_plus;
10585 }
10586
10587 if (GET_CODE (op0) == MINUS)
10588 {
10589 /* SUBS. */
10590 x = op0;
10591 goto cost_minus;
10592 }
10593
10594 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10595 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10596 && CONST_INT_P (XEXP (op0, 2)))
10597 {
10598 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10599 Handle it here directly rather than going to cost_logic
10600 since we know the immediate generated for the TST is valid
10601 so we can avoid creating an intermediate rtx for it only
10602 for costing purposes. */
10603 if (speed)
10604 *cost += extra_cost->alu.logical;
10605
10606 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10607 ZERO_EXTRACT, 0, speed);
10608 return true;
10609 }
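          /* For example, the block just above would see a compare such as

               (compare:CC_NZ (zero_extract:DI (reg:DI x0)
                                               (const_int 8)
                                               (const_int 0))
                              (const_int 0))

             and cost it as something like "tst x0, #0xff" (register and
             mask chosen purely for illustration): one logical op plus the
             cost of the extracted register.  */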
10610
10611 if (GET_CODE (op1) == NEG)
10612 {
10613 /* CMN. */
10614 if (speed)
10615 *cost += extra_cost->alu.arith;
10616
10617 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10618 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10619 return true;
10620 }
10621
10622 /* CMP.
10623
10624 Compare can freely swap the order of operands, and
10625 canonicalization puts the more complex operation first.
10626 But the integer MINUS logic expects the shift/extend
10627 operation in op1. */
10628 if (! (REG_P (op0)
10629 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10630 {
10631 op0 = XEXP (x, 1);
10632 op1 = XEXP (x, 0);
10633 }
10634 goto cost_minus;
10635 }
10636
10637 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10638 {
10639 /* FCMP. */
10640 if (speed)
10641 *cost += extra_cost->fp[mode == DFmode].compare;
10642
10643 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10644 {
10645 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10646 /* FCMP supports constant 0.0 for no extra cost. */
10647 return true;
10648 }
10649 return false;
10650 }
10651
10652 if (VECTOR_MODE_P (mode))
10653 {
10654 /* Vector compare. */
10655 if (speed)
10656 *cost += extra_cost->vect.alu;
10657
10658 if (aarch64_float_const_zero_rtx_p (op1))
10659 {
10660 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10661 cost. */
10662 return true;
10663 }
10664 return false;
10665 }
10666 return false;
10667
10668 case MINUS:
10669 {
10670 op0 = XEXP (x, 0);
10671 op1 = XEXP (x, 1);
10672
10673 cost_minus:
10674 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10675
10676 /* Detect valid immediates. */
10677 if ((GET_MODE_CLASS (mode) == MODE_INT
10678 || (GET_MODE_CLASS (mode) == MODE_CC
10679 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10680 && CONST_INT_P (op1)
10681 && aarch64_uimm12_shift (INTVAL (op1)))
10682 {
10683 if (speed)
10684 /* SUB(S) (immediate). */
10685 *cost += extra_cost->alu.arith;
10686 return true;
10687 }
10688
10689 /* Look for SUB (extended register). */
10690 if (is_a <scalar_int_mode> (mode, &int_mode)
10691 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10692 {
10693 if (speed)
10694 *cost += extra_cost->alu.extend_arith;
10695
10696 op1 = aarch64_strip_extend (op1, true);
10697 *cost += rtx_cost (op1, VOIDmode,
10698 (enum rtx_code) GET_CODE (op1), 0, speed);
10699 return true;
10700 }
10701
10702 rtx new_op1 = aarch64_strip_extend (op1, false);
10703
10704 /* Cost this as an FMA-alike operation. */
10705 if ((GET_CODE (new_op1) == MULT
10706 || aarch64_shift_p (GET_CODE (new_op1)))
10707 && code != COMPARE)
10708 {
10709 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10710 (enum rtx_code) code,
10711 speed);
10712 return true;
10713 }
10714
10715 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10716
10717 if (speed)
10718 {
10719 if (VECTOR_MODE_P (mode))
10720 {
10721 /* Vector SUB. */
10722 *cost += extra_cost->vect.alu;
10723 }
10724 else if (GET_MODE_CLASS (mode) == MODE_INT)
10725 {
10726 /* SUB(S). */
10727 *cost += extra_cost->alu.arith;
10728 }
10729 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10730 {
10731 /* FSUB. */
10732 *cost += extra_cost->fp[mode == DFmode].addsub;
10733 }
10734 }
10735 return true;
10736 }
10737
10738 case PLUS:
10739 {
10740 rtx new_op0;
10741
10742 op0 = XEXP (x, 0);
10743 op1 = XEXP (x, 1);
10744
10745 cost_plus:
10746 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10747 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10748 {
10749 /* CSINC. */
10750 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10751 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10752 return true;
10753 }
10754
10755 if (GET_MODE_CLASS (mode) == MODE_INT
10756 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10757 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10758 {
10759 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10760
10761 if (speed)
10762 /* ADD (immediate). */
10763 *cost += extra_cost->alu.arith;
10764 return true;
10765 }
10766
10767 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10768
10769 /* Look for ADD (extended register). */
10770 if (is_a <scalar_int_mode> (mode, &int_mode)
10771 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10772 {
10773 if (speed)
10774 *cost += extra_cost->alu.extend_arith;
10775
10776 op0 = aarch64_strip_extend (op0, true);
10777 *cost += rtx_cost (op0, VOIDmode,
10778 (enum rtx_code) GET_CODE (op0), 0, speed);
10779 return true;
10780 }
10781
10782 /* Strip any extend, leave shifts behind as we will
10783 cost them through mult_cost. */
10784 new_op0 = aarch64_strip_extend (op0, false);
10785
10786 if (GET_CODE (new_op0) == MULT
10787 || aarch64_shift_p (GET_CODE (new_op0)))
10788 {
10789 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10790 speed);
10791 return true;
10792 }
10793
10794 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10795
10796 if (speed)
10797 {
10798 if (VECTOR_MODE_P (mode))
10799 {
10800 /* Vector ADD. */
10801 *cost += extra_cost->vect.alu;
10802 }
10803 else if (GET_MODE_CLASS (mode) == MODE_INT)
10804 {
10805 /* ADD. */
10806 *cost += extra_cost->alu.arith;
10807 }
10808 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10809 {
10810 /* FADD. */
10811 *cost += extra_cost->fp[mode == DFmode].addsub;
10812 }
10813 }
10814 return true;
10815 }
10816
10817 case BSWAP:
10818 *cost = COSTS_N_INSNS (1);
10819
10820 if (speed)
10821 {
10822 if (VECTOR_MODE_P (mode))
10823 *cost += extra_cost->vect.alu;
10824 else
10825 *cost += extra_cost->alu.rev;
10826 }
10827 return false;
10828
10829 case IOR:
10830 if (aarch_rev16_p (x))
10831 {
10832 *cost = COSTS_N_INSNS (1);
10833
10834 if (speed)
10835 {
10836 if (VECTOR_MODE_P (mode))
10837 *cost += extra_cost->vect.alu;
10838 else
10839 *cost += extra_cost->alu.rev;
10840 }
10841 return true;
10842 }
10843
10844 if (aarch64_extr_rtx_p (x, &op0, &op1))
10845 {
10846 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10847 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10848 if (speed)
10849 *cost += extra_cost->alu.shift;
10850
10851 return true;
10852 }
10853 /* Fall through. */
10854 case XOR:
10855 case AND:
10856 cost_logic:
10857 op0 = XEXP (x, 0);
10858 op1 = XEXP (x, 1);
10859
10860 if (VECTOR_MODE_P (mode))
10861 {
10862 if (speed)
10863 *cost += extra_cost->vect.alu;
10864 return true;
10865 }
10866
10867 if (code == AND
10868 && GET_CODE (op0) == MULT
10869 && CONST_INT_P (XEXP (op0, 1))
10870 && CONST_INT_P (op1)
10871 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10872 INTVAL (op1)) != 0)
10873 {
10874 /* This is a UBFM/SBFM. */
10875 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10876 if (speed)
10877 *cost += extra_cost->alu.bfx;
10878 return true;
10879 }
10880
10881 if (is_int_mode (mode, &int_mode))
10882 {
10883 if (CONST_INT_P (op1))
10884 {
10885 /* We have a mask + shift version of a UBFIZ
10886 i.e. the *andim_ashift<mode>_bfiz pattern. */
10887 if (GET_CODE (op0) == ASHIFT
10888 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10889 XEXP (op0, 1)))
10890 {
10891 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10892 (enum rtx_code) code, 0, speed);
10893 if (speed)
10894 *cost += extra_cost->alu.bfx;
10895
10896 return true;
10897 }
10898 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10899 {
10900 /* We may get the immediate for free; this is not
10901 modelled. */
10902 *cost += rtx_cost (op0, int_mode,
10903 (enum rtx_code) code, 0, speed);
10904 if (speed)
10905 *cost += extra_cost->alu.logical;
10906
10907 return true;
10908 }
10909 }
10910 else
10911 {
10912 rtx new_op0 = op0;
10913
10914 /* Handle ORN, EON, or BIC. */
10915 if (GET_CODE (op0) == NOT)
10916 op0 = XEXP (op0, 0);
10917
10918 new_op0 = aarch64_strip_shift (op0);
10919
10920 /* If we had a shift on op0 then this is a logical-shift-
10921 by-register/immediate operation. Otherwise, this is just
10922 a logical operation. */
10923 if (speed)
10924 {
10925 if (new_op0 != op0)
10926 {
10927 /* Shift by immediate. */
10928 if (CONST_INT_P (XEXP (op0, 1)))
10929 *cost += extra_cost->alu.log_shift;
10930 else
10931 *cost += extra_cost->alu.log_shift_reg;
10932 }
10933 else
10934 *cost += extra_cost->alu.logical;
10935 }
10936
10937 /* In both cases we want to cost both operands. */
10938 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10939 0, speed);
10940 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10941 1, speed);
10942
10943 return true;
10944 }
10945 }
10946 return false;
10947
10948 case NOT:
10949 x = XEXP (x, 0);
10950 op0 = aarch64_strip_shift (x);
10951
10952 if (VECTOR_MODE_P (mode))
10953 {
10954 /* Vector NOT. */
10955 *cost += extra_cost->vect.alu;
10956 return false;
10957 }
10958
10959 /* MVN-shifted-reg. */
10960 if (op0 != x)
10961 {
10962 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10963
10964 if (speed)
10965 *cost += extra_cost->alu.log_shift;
10966
10967 return true;
10968 }
10969 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10970 Handle the second form here taking care that 'a' in the above can
10971 be a shift. */
10972 else if (GET_CODE (op0) == XOR)
10973 {
10974 rtx newop0 = XEXP (op0, 0);
10975 rtx newop1 = XEXP (op0, 1);
10976 rtx op0_stripped = aarch64_strip_shift (newop0);
10977
10978 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10979 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10980
10981 if (speed)
10982 {
10983 if (op0_stripped != newop0)
10984 *cost += extra_cost->alu.log_shift;
10985 else
10986 *cost += extra_cost->alu.logical;
10987 }
10988
10989 return true;
10990 }
10991 /* MVN. */
10992 if (speed)
10993 *cost += extra_cost->alu.logical;
10994
10995 return false;
10996
10997 case ZERO_EXTEND:
10998
10999 op0 = XEXP (x, 0);
11000 /* If a value is written in SI mode, then zero extended to DI
11001 mode, the operation will in general be free as a write to
11002 a 'w' register implicitly zeroes the upper bits of an 'x'
11003 register. However, if this is
11004
11005 (set (reg) (zero_extend (reg)))
11006
11007 we must cost the explicit register move. */
11008 if (mode == DImode
11009 && GET_MODE (op0) == SImode
11010 && outer == SET)
11011 {
11012 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11013
11014 /* If OP_COST is non-zero, then the cost of the zero extend
11015 is effectively the cost of the inner operation. Otherwise
11016 we have a MOV instruction and we take the cost from the MOV
11017 itself. This is true independently of whether we are
11018 optimizing for space or time. */
11019 if (op_cost)
11020 *cost = op_cost;
11021
11022 return true;
11023 }
11024 else if (MEM_P (op0))
11025 {
11026 /* All loads can zero extend to any size for free. */
11027 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11028 return true;
11029 }
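      /* For example (illustrative only), for

           uint64_t f (uint32_t *p) { return *p; }

         the zero extension is folded into the load itself (an
         "ldr w0, [x0]" already clears bits 63:32), so no separate extend
         is costed.  */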
11030
11031 op0 = aarch64_extend_bitfield_pattern_p (x);
11032 if (op0)
11033 {
11034 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11035 if (speed)
11036 *cost += extra_cost->alu.bfx;
11037 return true;
11038 }
11039
11040 if (speed)
11041 {
11042 if (VECTOR_MODE_P (mode))
11043 {
11044 /* UMOV. */
11045 *cost += extra_cost->vect.alu;
11046 }
11047 else
11048 {
11049 /* We generate an AND instead of UXTB/UXTH. */
11050 *cost += extra_cost->alu.logical;
11051 }
11052 }
11053 return false;
11054
11055 case SIGN_EXTEND:
11056 if (MEM_P (XEXP (x, 0)))
11057 {
11058 /* LDRSH. */
11059 if (speed)
11060 {
11061 rtx address = XEXP (XEXP (x, 0), 0);
11062 *cost += extra_cost->ldst.load_sign_extend;
11063
11064 *cost +=
11065 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11066 0, speed));
11067 }
11068 return true;
11069 }
11070
11071 op0 = aarch64_extend_bitfield_pattern_p (x);
11072 if (op0)
11073 {
11074 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11075 if (speed)
11076 *cost += extra_cost->alu.bfx;
11077 return true;
11078 }
11079
11080 if (speed)
11081 {
11082 if (VECTOR_MODE_P (mode))
11083 *cost += extra_cost->vect.alu;
11084 else
11085 *cost += extra_cost->alu.extend;
11086 }
11087 return false;
11088
11089 case ASHIFT:
11090 op0 = XEXP (x, 0);
11091 op1 = XEXP (x, 1);
11092
11093 if (CONST_INT_P (op1))
11094 {
11095 if (speed)
11096 {
11097 if (VECTOR_MODE_P (mode))
11098 {
11099 /* Vector shift (immediate). */
11100 *cost += extra_cost->vect.alu;
11101 }
11102 else
11103 {
11104 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11105 aliases. */
11106 *cost += extra_cost->alu.shift;
11107 }
11108 }
11109
11110 /* We can incorporate zero/sign extend for free. */
11111 if (GET_CODE (op0) == ZERO_EXTEND
11112 || GET_CODE (op0) == SIGN_EXTEND)
11113 op0 = XEXP (op0, 0);
11114
11115 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11116 return true;
11117 }
11118 else
11119 {
11120 if (VECTOR_MODE_P (mode))
11121 {
11122 if (speed)
11123 /* Vector shift (register). */
11124 *cost += extra_cost->vect.alu;
11125 }
11126 else
11127 {
11128 if (speed)
11129 /* LSLV. */
11130 *cost += extra_cost->alu.shift_reg;
11131
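              /* The test below matches shift amounts that have already been
                 reduced modulo the register width, e.g. "x << (n & 63)" in
                 DImode.  The variable shift instructions (LSLV and friends)
                 only use the low bits of the shift-amount register, so the
                 AND needs no extra instruction and only OP0 is costed.  */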
11132 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11133 && CONST_INT_P (XEXP (op1, 1))
11134 && known_eq (INTVAL (XEXP (op1, 1)),
11135 GET_MODE_BITSIZE (mode) - 1))
11136 {
11137 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11138 /* We already demanded XEXP (op1, 0) to be REG_P, so
11139 don't recurse into it. */
11140 return true;
11141 }
11142 }
11143 return false; /* All arguments need to be in registers. */
11144 }
11145
11146 case ROTATE:
11147 case ROTATERT:
11148 case LSHIFTRT:
11149 case ASHIFTRT:
11150 op0 = XEXP (x, 0);
11151 op1 = XEXP (x, 1);
11152
11153 if (CONST_INT_P (op1))
11154 {
11155 /* ASR (immediate) and friends. */
11156 if (speed)
11157 {
11158 if (VECTOR_MODE_P (mode))
11159 *cost += extra_cost->vect.alu;
11160 else
11161 *cost += extra_cost->alu.shift;
11162 }
11163
11164 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11165 return true;
11166 }
11167 else
11168 {
11169 if (VECTOR_MODE_P (mode))
11170 {
11171 if (speed)
11172 /* Vector shift (register). */
11173 *cost += extra_cost->vect.alu;
11174 }
11175 else
11176 {
11177 if (speed)
11178 /* ASR (register) and friends. */
11179 *cost += extra_cost->alu.shift_reg;
11180
11181 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11182 && CONST_INT_P (XEXP (op1, 1))
11183 && known_eq (INTVAL (XEXP (op1, 1)),
11184 GET_MODE_BITSIZE (mode) - 1))
11185 {
11186 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11187 /* We already demanded XEXP (op1, 0) to be REG_P, so
11188 don't recurse into it. */
11189 return true;
11190 }
11191 }
11192 return false; /* All arguments need to be in registers. */
11193 }
11194
11195 case SYMBOL_REF:
11196
11197 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11198 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11199 {
11200 /* LDR. */
11201 if (speed)
11202 *cost += extra_cost->ldst.load;
11203 }
11204 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11205 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11206 {
11207 /* ADRP, followed by ADD. */
11208 *cost += COSTS_N_INSNS (1);
11209 if (speed)
11210 *cost += 2 * extra_cost->alu.arith;
11211 }
11212 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11213 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11214 {
11215 /* ADR. */
11216 if (speed)
11217 *cost += extra_cost->alu.arith;
11218 }
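        /* For example (registers illustrative), the small code model forms
           a symbol address as

             adrp  x0, sym
             add   x0, x0, :lo12:sym

           whereas the tiny model can use a single pc-relative
           "adr x0, sym".  */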
11219
11220 if (flag_pic)
11221 {
11222 /* One extra load instruction, after accessing the GOT. */
11223 *cost += COSTS_N_INSNS (1);
11224 if (speed)
11225 *cost += extra_cost->ldst.load;
11226 }
11227 return true;
11228
11229 case HIGH:
11230 case LO_SUM:
11231 /* ADRP/ADD (immediate). */
11232 if (speed)
11233 *cost += extra_cost->alu.arith;
11234 return true;
11235
11236 case ZERO_EXTRACT:
11237 case SIGN_EXTRACT:
11238 /* UBFX/SBFX. */
11239 if (speed)
11240 {
11241 if (VECTOR_MODE_P (mode))
11242 *cost += extra_cost->vect.alu;
11243 else
11244 *cost += extra_cost->alu.bfx;
11245 }
11246
11247 /* We can trust that the immediates used will be correct (there
11248 are no by-register forms), so we need only cost op0. */
11249 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11250 return true;
11251
11252 case MULT:
11253 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11254 /* aarch64_rtx_mult_cost always handles recursion to its
11255 operands. */
11256 return true;
11257
11258 case MOD:
11259 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11260 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
11261 that of an unconditional negate. This case should only ever be reached
11262 through the set_smod_pow2_cheap check in expmed.c. */
11263 if (CONST_INT_P (XEXP (x, 1))
11264 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11265 && (mode == SImode || mode == DImode))
11266 {
11267 /* We expand to 4 instructions. Reset the baseline. */
11268 *cost = COSTS_N_INSNS (4);
11269
11270 if (speed)
11271 *cost += 2 * extra_cost->alu.logical
11272 + 2 * extra_cost->alu.arith;
11273
11274 return true;
11275 }
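      /* As a sketch of the expansion mentioned above (register choices are
         illustrative), "x % 4" in SImode becomes something like:

           negs   w1, w0
           and    w0, w0, 3
           and    w1, w1, 3
           csneg  w0, w0, w1, mi

         i.e. two logical and two arithmetic instructions, matching the
         costs added above.  */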
11276
11277 /* Fall-through. */
11278 case UMOD:
11279 if (speed)
11280 {
11281 /* Slightly prefer UMOD over SMOD. */
11282 if (VECTOR_MODE_P (mode))
11283 *cost += extra_cost->vect.alu;
11284 else if (GET_MODE_CLASS (mode) == MODE_INT)
11285 *cost += (extra_cost->mult[mode == DImode].add
11286 + extra_cost->mult[mode == DImode].idiv
11287 + (code == MOD ? 1 : 0));
11288 }
11289 return false; /* All arguments need to be in registers. */
11290
11291 case DIV:
11292 case UDIV:
11293 case SQRT:
11294 if (speed)
11295 {
11296 if (VECTOR_MODE_P (mode))
11297 *cost += extra_cost->vect.alu;
11298 else if (GET_MODE_CLASS (mode) == MODE_INT)
11299 /* There is no integer SQRT, so only DIV and UDIV can get
11300 here. */
11301 *cost += (extra_cost->mult[mode == DImode].idiv
11302 /* Slightly prefer UDIV over SDIV. */
11303 + (code == DIV ? 1 : 0));
11304 else
11305 *cost += extra_cost->fp[mode == DFmode].div;
11306 }
11307 return false; /* All arguments need to be in registers. */
11308
11309 case IF_THEN_ELSE:
11310 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11311 XEXP (x, 2), cost, speed);
11312
11313 case EQ:
11314 case NE:
11315 case GT:
11316 case GTU:
11317 case LT:
11318 case LTU:
11319 case GE:
11320 case GEU:
11321 case LE:
11322 case LEU:
11323
11324 return false; /* All arguments must be in registers. */
11325
11326 case FMA:
11327 op0 = XEXP (x, 0);
11328 op1 = XEXP (x, 1);
11329 op2 = XEXP (x, 2);
11330
11331 if (speed)
11332 {
11333 if (VECTOR_MODE_P (mode))
11334 *cost += extra_cost->vect.alu;
11335 else
11336 *cost += extra_cost->fp[mode == DFmode].fma;
11337 }
11338
11339 /* FMSUB, FNMADD, and FNMSUB are free. */
11340 if (GET_CODE (op0) == NEG)
11341 op0 = XEXP (op0, 0);
11342
11343 if (GET_CODE (op2) == NEG)
11344 op2 = XEXP (op2, 0);
11345
11346 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11347 and the by-element operand as operand 0. */
11348 if (GET_CODE (op1) == NEG)
11349 op1 = XEXP (op1, 0);
11350
11351 /* Catch vector-by-element operations. The by-element operand can
11352 either be (vec_duplicate (vec_select (x))) or just
11353 (vec_select (x)), depending on whether we are multiplying by
11354 a vector or a scalar.
11355
11356 Canonicalization is not very good in these cases, FMA4 will put the
11357 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11358 if (GET_CODE (op0) == VEC_DUPLICATE)
11359 op0 = XEXP (op0, 0);
11360 else if (GET_CODE (op1) == VEC_DUPLICATE)
11361 op1 = XEXP (op1, 0);
11362
11363 if (GET_CODE (op0) == VEC_SELECT)
11364 op0 = XEXP (op0, 0);
11365 else if (GET_CODE (op1) == VEC_SELECT)
11366 op1 = XEXP (op1, 0);
11367
11368 /* If the remaining parameters are not registers,
11369 get the cost to put them into registers. */
11370 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11371 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11372 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11373 return true;
11374
11375 case FLOAT:
11376 case UNSIGNED_FLOAT:
11377 if (speed)
11378 *cost += extra_cost->fp[mode == DFmode].fromint;
11379 return false;
11380
11381 case FLOAT_EXTEND:
11382 if (speed)
11383 {
11384 if (VECTOR_MODE_P (mode))
11385 {
11386 /* Vector widening conversion. */
11387 *cost += extra_cost->vect.alu;
11388 }
11389 else
11390 *cost += extra_cost->fp[mode == DFmode].widen;
11391 }
11392 return false;
11393
11394 case FLOAT_TRUNCATE:
11395 if (speed)
11396 {
11397 if (VECTOR_MODE_P (mode))
11398 {
11399 /* Vector conversion. */
11400 *cost += extra_cost->vect.alu;
11401 }
11402 else
11403 *cost += extra_cost->fp[mode == DFmode].narrow;
11404 }
11405 return false;
11406
11407 case FIX:
11408 case UNSIGNED_FIX:
11409 x = XEXP (x, 0);
11410 /* Strip the rounding part. They will all be implemented
11411 by the fcvt* family of instructions anyway. */
11412 if (GET_CODE (x) == UNSPEC)
11413 {
11414 unsigned int uns_code = XINT (x, 1);
11415
11416 if (uns_code == UNSPEC_FRINTA
11417 || uns_code == UNSPEC_FRINTM
11418 || uns_code == UNSPEC_FRINTN
11419 || uns_code == UNSPEC_FRINTP
11420 || uns_code == UNSPEC_FRINTZ)
11421 x = XVECEXP (x, 0, 0);
11422 }
11423
11424 if (speed)
11425 {
11426 if (VECTOR_MODE_P (mode))
11427 *cost += extra_cost->vect.alu;
11428 else
11429 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11430 }
11431
11432 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11433 fixed-point fcvt. */
11434 if (GET_CODE (x) == MULT
11435 && ((VECTOR_MODE_P (mode)
11436 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11437 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11438 {
11439 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11440 0, speed);
11441 return true;
11442 }
11443
11444 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11445 return true;
11446
11447 case ABS:
11448 if (VECTOR_MODE_P (mode))
11449 {
11450 /* ABS (vector). */
11451 if (speed)
11452 *cost += extra_cost->vect.alu;
11453 }
11454 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11455 {
11456 op0 = XEXP (x, 0);
11457
11458 /* FABD, which is analogous to FADD. */
11459 if (GET_CODE (op0) == MINUS)
11460 {
11461 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11462 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11463 if (speed)
11464 *cost += extra_cost->fp[mode == DFmode].addsub;
11465
11466 return true;
11467 }
11468 /* Simple FABS is analogous to FNEG. */
11469 if (speed)
11470 *cost += extra_cost->fp[mode == DFmode].neg;
11471 }
11472 else
11473 {
11474 /* Integer ABS will either be split into
11475 two arithmetic instructions, or will be an ABS
11476 (scalar), which we don't model. */
11477 *cost = COSTS_N_INSNS (2);
11478 if (speed)
11479 *cost += 2 * extra_cost->alu.arith;
11480 }
11481 return false;
11482
11483 case SMAX:
11484 case SMIN:
11485 if (speed)
11486 {
11487 if (VECTOR_MODE_P (mode))
11488 *cost += extra_cost->vect.alu;
11489 else
11490 {
11491 /* FMAXNM/FMINNM/FMAX/FMIN.
11492 TODO: This may not be accurate for all implementations, but
11493 we do not model this in the cost tables. */
11494 *cost += extra_cost->fp[mode == DFmode].addsub;
11495 }
11496 }
11497 return false;
11498
11499 case UNSPEC:
11500 /* The floating point round to integer frint* instructions. */
11501 if (aarch64_frint_unspec_p (XINT (x, 1)))
11502 {
11503 if (speed)
11504 *cost += extra_cost->fp[mode == DFmode].roundint;
11505
11506 return false;
11507 }
11508
11509 if (XINT (x, 1) == UNSPEC_RBIT)
11510 {
11511 if (speed)
11512 *cost += extra_cost->alu.rev;
11513
11514 return false;
11515 }
11516 break;
11517
11518 case TRUNCATE:
11519
11520 /* Decompose <su>muldi3_highpart. */
11521 if (/* (truncate:DI */
11522 mode == DImode
11523 /* (lshiftrt:TI */
11524 && GET_MODE (XEXP (x, 0)) == TImode
11525 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11526 /* (mult:TI */
11527 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11528 /* (ANY_EXTEND:TI (reg:DI))
11529 (ANY_EXTEND:TI (reg:DI))) */
11530 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11531 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11532 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11533 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11534 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
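          /* E.g. (registers illustrative) -(a * b + c) can become a single
             "fnmadd d0, d0, d1, d2" and -(a * b) a single
             "fnmul d0, d0, d1", so the NEG itself adds no cost here.  */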
11535 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11536 /* (const_int 64) */
11537 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11538 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11539 {
11540 /* UMULH/SMULH. */
11541 if (speed)
11542 *cost += extra_cost->mult[mode == DImode].extend;
11543 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11544 mode, MULT, 0, speed);
11545 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11546 mode, MULT, 1, speed);
11547 return true;
11548 }
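      /* For instance (an illustrative example), the high half of an
         unsigned 64x64->128-bit multiply,

           uint64_t hi (uint64_t a, uint64_t b)
           { return ((unsigned __int128) a * b) >> 64; }

         has this shape and becomes a single "umulh x0, x0, x1".  */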
11549
11550 /* Fall through. */
11551 default:
11552 break;
11553 }
11554
11555 if (dump_file
11556 && flag_aarch64_verbose_cost)
11557 fprintf (dump_file,
11558 "\nFailed to cost RTX. Assuming default cost.\n");
11559
11560 return true;
11561 }
11562
11563 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11564 calculated for X. This cost is stored in *COST. Returns true
11565 if the total cost of X was calculated. */
11566 static bool
11567 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11568 int param, int *cost, bool speed)
11569 {
11570 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11571
11572 if (dump_file
11573 && flag_aarch64_verbose_cost)
11574 {
11575 print_rtl_single (dump_file, x);
11576 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11577 speed ? "Hot" : "Cold",
11578 *cost, result ? "final" : "partial");
11579 }
11580
11581 return result;
11582 }
11583
11584 static int
11585 aarch64_register_move_cost (machine_mode mode,
11586 reg_class_t from_i, reg_class_t to_i)
11587 {
11588 enum reg_class from = (enum reg_class) from_i;
11589 enum reg_class to = (enum reg_class) to_i;
11590 const struct cpu_regmove_cost *regmove_cost
11591 = aarch64_tune_params.regmove_cost;
11592
11593 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11594 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11595 to = GENERAL_REGS;
11596
11597 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11598 from = GENERAL_REGS;
11599
11600 /* Moving between GPR and stack cost is the same as GP2GP. */
11601 if ((from == GENERAL_REGS && to == STACK_REG)
11602 || (to == GENERAL_REGS && from == STACK_REG))
11603 return regmove_cost->GP2GP;
11604
11605 /* To/From the stack register, we move via the gprs. */
11606 if (to == STACK_REG || from == STACK_REG)
11607 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11608 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11609
11610 if (known_eq (GET_MODE_SIZE (mode), 16))
11611 {
11612 /* 128-bit operations on general registers require 2 instructions. */
11613 if (from == GENERAL_REGS && to == GENERAL_REGS)
11614 return regmove_cost->GP2GP * 2;
11615 else if (from == GENERAL_REGS)
11616 return regmove_cost->GP2FP * 2;
11617 else if (to == GENERAL_REGS)
11618 return regmove_cost->FP2GP * 2;
11619
11620 /* When AdvSIMD instructions are disabled it is not possible to move
11621 a 128-bit value directly between Q registers. This is handled in
11622 secondary reload. A general register is used as a scratch to move
11623 the upper DI value and the lower DI value is moved directly,
11624 hence the cost is the sum of three moves. */
11625 if (! TARGET_SIMD)
11626 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11627
11628 return regmove_cost->FP2FP;
11629 }
11630
11631 if (from == GENERAL_REGS && to == GENERAL_REGS)
11632 return regmove_cost->GP2GP;
11633 else if (from == GENERAL_REGS)
11634 return regmove_cost->GP2FP;
11635 else if (to == GENERAL_REGS)
11636 return regmove_cost->FP2GP;
11637
11638 return regmove_cost->FP2FP;
11639 }
11640
11641 static int
11642 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11643 reg_class_t rclass ATTRIBUTE_UNUSED,
11644 bool in ATTRIBUTE_UNUSED)
11645 {
11646 return aarch64_tune_params.memmov_cost;
11647 }
11648
11649 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11650 to optimize 1.0/sqrt. */
11651
11652 static bool
11653 use_rsqrt_p (machine_mode mode)
11654 {
11655 return (!flag_trapping_math
11656 && flag_unsafe_math_optimizations
11657 && ((aarch64_tune_params.approx_modes->recip_sqrt
11658 & AARCH64_APPROX_MODE (mode))
11659 || flag_mrecip_low_precision_sqrt));
11660 }
11661
11662 /* Function to decide when to use the approximate reciprocal square root
11663 builtin. */
11664
11665 static tree
11666 aarch64_builtin_reciprocal (tree fndecl)
11667 {
11668 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11669
11670 if (!use_rsqrt_p (mode))
11671 return NULL_TREE;
11672 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11673 }
11674
11675 /* Emit instruction sequence to compute either the approximate square root
11676 or its approximate reciprocal, depending on the flag RECP, and return
11677 whether the sequence was emitted or not. */
11678
11679 bool
11680 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11681 {
11682 machine_mode mode = GET_MODE (dst);
11683
11684 if (GET_MODE_INNER (mode) == HFmode)
11685 {
11686 gcc_assert (!recp);
11687 return false;
11688 }
11689
11690 if (!recp)
11691 {
11692 if (!(flag_mlow_precision_sqrt
11693 || (aarch64_tune_params.approx_modes->sqrt
11694 & AARCH64_APPROX_MODE (mode))))
11695 return false;
11696
11697 if (flag_finite_math_only
11698 || flag_trapping_math
11699 || !flag_unsafe_math_optimizations
11700 || optimize_function_for_size_p (cfun))
11701 return false;
11702 }
11703 else
11704 /* Caller assumes we cannot fail. */
11705 gcc_assert (use_rsqrt_p (mode));
11706
11707 machine_mode mmsk = mode_for_int_vector (mode).require ();
11708 rtx xmsk = gen_reg_rtx (mmsk);
11709 if (!recp)
11710 /* When calculating the approximate square root, compare the
11711 argument with 0.0 and create a mask. */
11712 emit_insn (gen_rtx_SET (xmsk,
11713 gen_rtx_NEG (mmsk,
11714 gen_rtx_EQ (mmsk, src,
11715 CONST0_RTX (mode)))));
11716
11717 /* Estimate the approximate reciprocal square root. */
11718 rtx xdst = gen_reg_rtx (mode);
11719 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11720
11721 /* Iterate over the series twice for SF and thrice for DF. */
11722 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11723
11724 /* Optionally do one fewer iteration for faster performance
11725 at the cost of some accuracy. */
11726 if ((recp && flag_mrecip_low_precision_sqrt)
11727 || (!recp && flag_mlow_precision_sqrt))
11728 iterations--;
11729
11730 /* Iterate over the series to calculate the approximate reciprocal square
11731 root. */
11732 rtx x1 = gen_reg_rtx (mode);
11733 while (iterations--)
11734 {
11735 rtx x2 = gen_reg_rtx (mode);
11736 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11737
11738 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11739
11740 if (iterations > 0)
11741 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11742 }
11743
11744 if (!recp)
11745 {
11746 /* Qualify the approximate reciprocal square root when the argument is
11747 0.0 by squashing the intermediate result to 0.0. */
11748 rtx xtmp = gen_reg_rtx (mmsk);
11749 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11750 gen_rtx_SUBREG (mmsk, xdst, 0)));
11751 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11752
11753 /* Calculate the approximate square root. */
11754 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11755 }
11756
11757 /* Finalize the approximation. */
11758 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11759
11760 return true;
11761 }
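/* A sketch of the math behind the sequence above: FRSQRTE gives an initial
   estimate x0 ~ 1/sqrt(s), and each FRSQRTS step computes (3 - s*x*x) / 2,
   so

     x_{n+1} = x_n * (3 - s * x_n * x_n) / 2

   is one Newton-Raphson step for 1/sqrt(s).  For the non-reciprocal case
   the estimate is finally multiplied by s (since sqrt(s) = s / sqrt(s)),
   with the mask computed at the start forcing sqrt(0.0) to 0.0.  */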
11762
11763 /* Emit the instruction sequence to compute the approximation for the division
11764 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11765
11766 bool
11767 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11768 {
11769 machine_mode mode = GET_MODE (quo);
11770
11771 if (GET_MODE_INNER (mode) == HFmode)
11772 return false;
11773
11774 bool use_approx_division_p = (flag_mlow_precision_div
11775 || (aarch64_tune_params.approx_modes->division
11776 & AARCH64_APPROX_MODE (mode)));
11777
11778 if (!flag_finite_math_only
11779 || flag_trapping_math
11780 || !flag_unsafe_math_optimizations
11781 || optimize_function_for_size_p (cfun)
11782 || !use_approx_division_p)
11783 return false;
11784
11785 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11786 return false;
11787
11788 /* Estimate the approximate reciprocal. */
11789 rtx xrcp = gen_reg_rtx (mode);
11790 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11791
11792 /* Iterate over the series twice for SF and thrice for DF. */
11793 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11794
11795 /* Optionally do one fewer iteration for faster performance,
11796 at the cost of some accuracy. */
11797 if (flag_mlow_precision_div)
11798 iterations--;
11799
11800 /* Iterate over the series to calculate the approximate reciprocal. */
11801 rtx xtmp = gen_reg_rtx (mode);
11802 while (iterations--)
11803 {
11804 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11805
11806 if (iterations > 0)
11807 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11808 }
11809
11810 if (num != CONST1_RTX (mode))
11811 {
11812 /* As the approximate reciprocal of DEN is already calculated, only
11813 calculate the approximate division when NUM is not 1.0. */
11814 rtx xnum = force_reg (mode, num);
11815 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11816 }
11817
11818 /* Finalize the approximation. */
11819 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11820 return true;
11821 }
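/* A sketch of the math behind the sequence above: FRECPE gives an initial
   estimate x0 ~ 1/d, and each FRECPS step computes 2 - d*x, so

     x_{n+1} = x_n * (2 - d * x_n)

   is one Newton-Raphson step for 1/d.  The quotient is then obtained as
   num * (1/d), with that final multiply skipped when NUM is 1.0.  */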
11822
11823 /* Return the number of instructions that can be issued per cycle. */
11824 static int
11825 aarch64_sched_issue_rate (void)
11826 {
11827 return aarch64_tune_params.issue_rate;
11828 }
11829
11830 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11831 static int
11832 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
11833 {
11834 if (DEBUG_INSN_P (insn))
11835 return more;
11836
11837 rtx_code code = GET_CODE (PATTERN (insn));
11838 if (code == USE || code == CLOBBER)
11839 return more;
11840
11841 if (get_attr_type (insn) == TYPE_NO_INSN)
11842 return more;
11843
11844 return more - 1;
11845 }
11846
11847 static int
11848 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11849 {
11850 int issue_rate = aarch64_sched_issue_rate ();
11851
11852 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11853 }
11854
11855
11856 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11857 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11858 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11859
11860 static int
11861 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11862 int ready_index)
11863 {
11864 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11865 }
11866
11867
11868 /* Vectorizer cost model target hooks. */
11869
11870 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11871 static int
11872 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11873 tree vectype,
11874 int misalign ATTRIBUTE_UNUSED)
11875 {
11876 unsigned elements;
11877 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11878 bool fp = false;
11879
11880 if (vectype != NULL)
11881 fp = FLOAT_TYPE_P (vectype);
11882
11883 switch (type_of_cost)
11884 {
11885 case scalar_stmt:
11886 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11887
11888 case scalar_load:
11889 return costs->scalar_load_cost;
11890
11891 case scalar_store:
11892 return costs->scalar_store_cost;
11893
11894 case vector_stmt:
11895 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11896
11897 case vector_load:
11898 return costs->vec_align_load_cost;
11899
11900 case vector_store:
11901 return costs->vec_store_cost;
11902
11903 case vec_to_scalar:
11904 return costs->vec_to_scalar_cost;
11905
11906 case scalar_to_vec:
11907 return costs->scalar_to_vec_cost;
11908
11909 case unaligned_load:
11910 case vector_gather_load:
11911 return costs->vec_unalign_load_cost;
11912
11913 case unaligned_store:
11914 case vector_scatter_store:
11915 return costs->vec_unalign_store_cost;
11916
11917 case cond_branch_taken:
11918 return costs->cond_taken_branch_cost;
11919
11920 case cond_branch_not_taken:
11921 return costs->cond_not_taken_branch_cost;
11922
11923 case vec_perm:
11924 return costs->vec_permute_cost;
11925
11926 case vec_promote_demote:
11927 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11928
11929 case vec_construct:
11930 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11931 return elements / 2 + 1;
11932
11933 default:
11934 gcc_unreachable ();
11935 }
11936 }
11937
11938 /* Implement targetm.vectorize.add_stmt_cost. */
11939 static unsigned
11940 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11941 struct _stmt_vec_info *stmt_info, int misalign,
11942 enum vect_cost_model_location where)
11943 {
11944 unsigned *cost = (unsigned *) data;
11945 unsigned retval = 0;
11946
11947 if (flag_vect_cost_model)
11948 {
11949 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11950 int stmt_cost =
11951 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11952
11953 /* Statements in an inner loop relative to the loop being
11954 vectorized are weighted more heavily. The value here is
11955 arbitrary and could potentially be improved with analysis. */
11956 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11957 count *= 50; /* FIXME */
11958
11959 retval = (unsigned) (count * stmt_cost);
11960 cost[where] += retval;
11961 }
11962
11963 return retval;
11964 }
11965
11966 static void initialize_aarch64_code_model (struct gcc_options *);
11967
11968 /* Parse the TO_PARSE string and put the architecture struct that it
11969 selects into RES and the architectural features into ISA_FLAGS.
11970 Return an aarch64_parse_opt_result describing the parse result.
11971 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11972 When the TO_PARSE string contains an invalid extension,
11973 a copy of the string is created and stored to INVALID_EXTENSION. */
11974
11975 static enum aarch64_parse_opt_result
11976 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11977 uint64_t *isa_flags, std::string *invalid_extension)
11978 {
11979 const char *ext;
11980 const struct processor *arch;
11981 size_t len;
11982
11983 ext = strchr (to_parse, '+');
11984
11985 if (ext != NULL)
11986 len = ext - to_parse;
11987 else
11988 len = strlen (to_parse);
11989
11990 if (len == 0)
11991 return AARCH64_PARSE_MISSING_ARG;
11992
11993
11994 /* Loop through the list of supported ARCHes to find a match. */
11995 for (arch = all_architectures; arch->name != NULL; arch++)
11996 {
11997 if (strlen (arch->name) == len
11998 && strncmp (arch->name, to_parse, len) == 0)
11999 {
12000 uint64_t isa_temp = arch->flags;
12001
12002 if (ext != NULL)
12003 {
12004 /* TO_PARSE string contains at least one extension. */
12005 enum aarch64_parse_opt_result ext_res
12006 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12007
12008 if (ext_res != AARCH64_PARSE_OK)
12009 return ext_res;
12010 }
12011 /* Extension parsing was successful. Confirm the result
12012 arch and ISA flags. */
12013 *res = arch;
12014 *isa_flags = isa_temp;
12015 return AARCH64_PARSE_OK;
12016 }
12017 }
12018
12019 /* ARCH name not found in list. */
12020 return AARCH64_PARSE_INVALID_ARG;
12021 }
12022
12023 /* Parse the TO_PARSE string and put the result tuning in RES and the
12024 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12025 describing the parse result. If there is an error parsing, RES and
12026 ISA_FLAGS are left unchanged.
12027 When the TO_PARSE string contains an invalid extension,
12028 a copy of the string is created and stored to INVALID_EXTENSION. */
12029
12030 static enum aarch64_parse_opt_result
12031 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12032 uint64_t *isa_flags, std::string *invalid_extension)
12033 {
12034 const char *ext;
12035 const struct processor *cpu;
12036 size_t len;
12037
12038 ext = strchr (to_parse, '+');
12039
12040 if (ext != NULL)
12041 len = ext - to_parse;
12042 else
12043 len = strlen (to_parse);
12044
12045 if (len == 0)
12046 return AARCH64_PARSE_MISSING_ARG;
12047
12048
12049 /* Loop through the list of supported CPUs to find a match. */
12050 for (cpu = all_cores; cpu->name != NULL; cpu++)
12051 {
12052 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12053 {
12054 uint64_t isa_temp = cpu->flags;
12055
12056
12057 if (ext != NULL)
12058 {
12059 /* TO_PARSE string contains at least one extension. */
12060 enum aarch64_parse_opt_result ext_res
12061 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12062
12063 if (ext_res != AARCH64_PARSE_OK)
12064 return ext_res;
12065 }
12066 /* Extension parsing was successful. Confirm the result
12067 cpu and ISA flags. */
12068 *res = cpu;
12069 *isa_flags = isa_temp;
12070 return AARCH64_PARSE_OK;
12071 }
12072 }
12073
12074 /* CPU name not found in list. */
12075 return AARCH64_PARSE_INVALID_ARG;
12076 }
12077
12078 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12079 Return an aarch64_parse_opt_result describing the parse result.
12080 If the parsing fails, RES is not changed. */
12081
12082 static enum aarch64_parse_opt_result
12083 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12084 {
12085 const struct processor *cpu;
12086
12087 /* Loop through the list of supported CPUs to find a match. */
12088 for (cpu = all_cores; cpu->name != NULL; cpu++)
12089 {
12090 if (strcmp (cpu->name, to_parse) == 0)
12091 {
12092 *res = cpu;
12093 return AARCH64_PARSE_OK;
12094 }
12095 }
12096
12097 /* CPU name not found in list. */
12098 return AARCH64_PARSE_INVALID_ARG;
12099 }
12100
12101 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12102 described in FLAG. If it is, return the index bit for that flag.
12103 If not, issue an error (printing OPTION_NAME) and return zero. */
12104
12105 static unsigned int
12106 aarch64_parse_one_option_token (const char *token,
12107 size_t length,
12108 const struct aarch64_flag_desc *flag,
12109 const char *option_name)
12110 {
12111 for (; flag->name != NULL; flag++)
12112 {
12113 if (length == strlen (flag->name)
12114 && !strncmp (flag->name, token, length))
12115 return flag->flag;
12116 }
12117
12118 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12119 return 0;
12120 }
12121
12122 /* Parse OPTION which is a comma-separated list of flags to enable.
12123 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12124 default state we inherit from the CPU tuning structures. OPTION_NAME
12125 gives the top-level option we are parsing in the -moverride string,
12126 for use in error messages. */
12127
12128 static unsigned int
12129 aarch64_parse_boolean_options (const char *option,
12130 const struct aarch64_flag_desc *flags,
12131 unsigned int initial_state,
12132 const char *option_name)
12133 {
12134 const char separator = '.';
12135 const char* specs = option;
12136 const char* ntoken = option;
12137 unsigned int found_flags = initial_state;
12138
12139 while ((ntoken = strchr (specs, separator)))
12140 {
12141 size_t token_length = ntoken - specs;
12142 unsigned token_ops = aarch64_parse_one_option_token (specs,
12143 token_length,
12144 flags,
12145 option_name);
12146 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12147 in the token stream, reset the supported operations. So:
12148
12149 adrp+add.cmp+branch.none.adrp+add
12150
12151 would have the result of turning on only adrp+add fusion. */
12152 if (!token_ops)
12153 found_flags = 0;
12154
12155 found_flags |= token_ops;
12156 specs = ++ntoken;
12157 }
12158
12159 /* The string ended with a trailing separator; diagnose it. */
12160 if (!(*specs))
12161 {
12162 error ("%s string ill-formed\n", option_name);
12163 return 0;
12164 }
12165
12166 /* We still have one more token to parse. */
12167 size_t token_length = strlen (specs);
12168 unsigned token_ops = aarch64_parse_one_option_token (specs,
12169 token_length,
12170 flags,
12171 option_name);
12172 if (!token_ops)
12173 found_flags = 0;
12174
12175 found_flags |= token_ops;
12176 return found_flags;
12177 }
12178
12179 /* Support for overriding instruction fusion. */
12180
12181 static void
12182 aarch64_parse_fuse_string (const char *fuse_string,
12183 struct tune_params *tune)
12184 {
12185 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12186 aarch64_fusible_pairs,
12187 tune->fusible_ops,
12188 "fuse=");
12189 }
12190
12191 /* Support for overriding other tuning flags. */
12192
12193 static void
12194 aarch64_parse_tune_string (const char *tune_string,
12195 struct tune_params *tune)
12196 {
12197 tune->extra_tuning_flags
12198 = aarch64_parse_boolean_options (tune_string,
12199 aarch64_tuning_flags,
12200 tune->extra_tuning_flags,
12201 "tune=");
12202 }
12203
12204 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12205 Accept the valid SVE vector widths allowed by
12206 aarch64_sve_vector_bits_enum and use it to override sve_width
12207 in TUNE. */
12208
12209 static void
12210 aarch64_parse_sve_width_string (const char *tune_string,
12211 struct tune_params *tune)
12212 {
12213 int width = -1;
12214
12215 int n = sscanf (tune_string, "%d", &width);
12216 if (n == EOF)
12217 {
12218 error ("invalid format for sve_width");
12219 return;
12220 }
12221 switch (width)
12222 {
12223 case SVE_128:
12224 case SVE_256:
12225 case SVE_512:
12226 case SVE_1024:
12227 case SVE_2048:
12228 break;
12229 default:
12230 error ("invalid sve_width value: %d", width);
12231 }
12232 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12233 }
12234
12235 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12236 we understand. If it is, extract the option string and hand it off to
12237 the appropriate function. */
12238
12239 void
12240 aarch64_parse_one_override_token (const char* token,
12241 size_t length,
12242 struct tune_params *tune)
12243 {
12244 const struct aarch64_tuning_override_function *fn
12245 = aarch64_tuning_override_functions;
12246
12247 const char *option_part = strchr (token, '=');
12248 if (!option_part)
12249 {
12250 error ("tuning string missing in option (%s)", token);
12251 return;
12252 }
12253
12254 /* Get the length of the option name. */
12255 length = option_part - token;
12256 /* Skip the '=' to get to the option string. */
12257 option_part++;
12258
12259 for (; fn->name != NULL; fn++)
12260 {
12261 if (!strncmp (fn->name, token, length))
12262 {
12263 fn->parse_override (option_part, tune);
12264 return;
12265 }
12266 }
12267
12268 error ("unknown tuning option (%s)", token);
12269 return;
12270 }
12271
12272 /* Set the default TLS offset size and clamp it to what the selected code model supports. */
12273
12274 static void
12275 initialize_aarch64_tls_size (struct gcc_options *opts)
12276 {
12277 if (aarch64_tls_size == 0)
12278 aarch64_tls_size = 24;
12279
12280 switch (opts->x_aarch64_cmodel_var)
12281 {
12282 case AARCH64_CMODEL_TINY:
12283 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12284 needs two instructions to address, so we clamp the size to 24 bits. */
12285 if (aarch64_tls_size > 24)
12286 aarch64_tls_size = 24;
12287 break;
12288 case AARCH64_CMODEL_SMALL:
12289 /* The maximum TLS size allowed under small is 4G. */
12290 if (aarch64_tls_size > 32)
12291 aarch64_tls_size = 32;
12292 break;
12293 case AARCH64_CMODEL_LARGE:
12294 /* The maximum TLS size allowed under large is 16E.
12295 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
12296 if (aarch64_tls_size > 48)
12297 aarch64_tls_size = 48;
12298 break;
12299 default:
12300 gcc_unreachable ();
12301 }
12302
12303 return;
12304 }
12305
12306 /* Parse STRING looking for options in the format:
12307 string :: option:string
12308 option :: name=substring
12309 name :: {a-z}
12310 substring :: defined by option. */
12311
12312 static void
12313 aarch64_parse_override_string (const char* input_string,
12314 struct tune_params* tune)
12315 {
12316 const char separator = ':';
12317 size_t string_length = strlen (input_string) + 1;
12318 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12319 char *string = string_root;
12320 strncpy (string, input_string, string_length);
12321 string[string_length - 1] = '\0';
12322
12323 char* ntoken = string;
12324
12325 while ((ntoken = strchr (string, separator)))
12326 {
12327 size_t token_length = ntoken - string;
12328 /* Make this substring look like a string. */
12329 *ntoken = '\0';
12330 aarch64_parse_one_override_token (string, token_length, tune);
12331 string = ++ntoken;
12332 }
12333
12334 /* One last option to parse. */
12335 aarch64_parse_one_override_token (string, strlen (string), tune);
12336 free (string_root);
12337 }
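/* As an example of the format described above (the particular values are
   illustrative), an option such as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   is split at ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256"; the
   first is handed to aarch64_parse_fuse_string, where the '.'-separated
   names select fusion pairs, and the second to
   aarch64_parse_sve_width_string.  */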
12338
12339
12340 static void
12341 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12342 {
12343 if (accepted_branch_protection_string)
12344 {
12345 opts->x_aarch64_branch_protection_string
12346 = xstrdup (accepted_branch_protection_string);
12347 }
12348
12349 /* PR 70044: We have to be careful about being called multiple times for the
12350 same function. This means all changes should be repeatable. */
12351
12352 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12353 Disable the frame pointer flag so the mid-end will not use a frame
12354 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12355 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12356 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12357 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12358 if (opts->x_flag_omit_frame_pointer == 0)
12359 opts->x_flag_omit_frame_pointer = 2;
12360
12361 /* If not optimizing for size, set the default
12362 alignment to what the target wants. */
12363 if (!opts->x_optimize_size)
12364 {
12365 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12366 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12367 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12368 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12369 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12370 opts->x_str_align_functions = aarch64_tune_params.function_align;
12371 }
12372
12373 /* We default to no pc-relative literal loads. */
12374
12375 aarch64_pcrelative_literal_loads = false;
12376
12377 /* If -mpc-relative-literal-loads is set on the command line, this
12378 implies that the user asked for PC relative literal loads. */
12379 if (opts->x_pcrelative_literal_loads == 1)
12380 aarch64_pcrelative_literal_loads = true;
12381
12382 /* In the tiny memory model it makes no sense to disallow PC relative
12383 literal pool loads. */
12384 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12385 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12386 aarch64_pcrelative_literal_loads = true;
12387
12388 /* When enabling the lower precision Newton series for the square root, also
12389 enable it for the reciprocal square root, since the latter is an
12390 intermediary step for the former. */
12391 if (flag_mlow_precision_sqrt)
12392 flag_mrecip_low_precision_sqrt = true;
12393 }
12394
12395 /* 'Unpack' the internal tuning structs and update the options
12396 in OPTS. The caller must have set up selected_tune and selected_arch
12397 as all the other target-specific codegen decisions are
12398 derived from them. */
12399
12400 void
12401 aarch64_override_options_internal (struct gcc_options *opts)
12402 {
12403 aarch64_tune_flags = selected_tune->flags;
12404 aarch64_tune = selected_tune->sched_core;
12405 /* Make a copy of the tuning parameters attached to the core, which
12406 we may later overwrite. */
12407 aarch64_tune_params = *(selected_tune->tune);
12408 aarch64_architecture_version = selected_arch->architecture_version;
12409
12410 if (opts->x_aarch64_override_tune_string)
12411 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12412 &aarch64_tune_params);
12413
12414 /* This target defaults to strict volatile bitfields. */
12415 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12416 opts->x_flag_strict_volatile_bitfields = 1;
12417
12418 if (aarch64_stack_protector_guard == SSP_GLOBAL
12419 && opts->x_aarch64_stack_protector_guard_offset_str)
12420 {
12421 error ("incompatible options %<-mstack-protector-guard=global%> and "
12422 "%<-mstack-protector-guard-offset=%s%>",
12423 aarch64_stack_protector_guard_offset_str);
12424 }
12425
12426 if (aarch64_stack_protector_guard == SSP_SYSREG
12427 && !(opts->x_aarch64_stack_protector_guard_offset_str
12428 && opts->x_aarch64_stack_protector_guard_reg_str))
12429 {
12430 error ("both %<-mstack-protector-guard-offset%> and "
12431 "%<-mstack-protector-guard-reg%> must be used "
12432 "with %<-mstack-protector-guard=sysreg%>");
12433 }
12434
12435 if (opts->x_aarch64_stack_protector_guard_reg_str)
12436 {
12437 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12438 error ("specify a system register with a small string length");
12439 }
12440
12441 if (opts->x_aarch64_stack_protector_guard_offset_str)
12442 {
12443 char *end;
12444 const char *str = aarch64_stack_protector_guard_offset_str;
12445 errno = 0;
12446 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12447 if (!*str || *end || errno)
12448 error ("%qs is not a valid offset in %qs", str,
12449 "-mstack-protector-guard-offset=");
12450 aarch64_stack_protector_guard_offset = offs;
12451 }
12452
12453 initialize_aarch64_code_model (opts);
12454 initialize_aarch64_tls_size (opts);
12455
12456 int queue_depth = 0;
12457 switch (aarch64_tune_params.autoprefetcher_model)
12458 {
12459 case tune_params::AUTOPREFETCHER_OFF:
12460 queue_depth = -1;
12461 break;
12462 case tune_params::AUTOPREFETCHER_WEAK:
12463 queue_depth = 0;
12464 break;
12465 case tune_params::AUTOPREFETCHER_STRONG:
12466 queue_depth = max_insn_queue_index + 1;
12467 break;
12468 default:
12469 gcc_unreachable ();
12470 }
12471
12472 /* We don't mind passing in global_options_set here as we don't use
12473 the *options_set structs anyway. */
12474 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12475 queue_depth,
12476 opts->x_param_values,
12477 global_options_set.x_param_values);
12478
12479 /* Set up parameters to be used in the prefetching algorithm. Do not
12480 override the defaults unless we are tuning for a core we have
12481 researched values for. */
12482 if (aarch64_tune_params.prefetch->num_slots > 0)
12483 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12484 aarch64_tune_params.prefetch->num_slots,
12485 opts->x_param_values,
12486 global_options_set.x_param_values);
12487 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12488 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12489 aarch64_tune_params.prefetch->l1_cache_size,
12490 opts->x_param_values,
12491 global_options_set.x_param_values);
12492 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12493 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12494 aarch64_tune_params.prefetch->l1_cache_line_size,
12495 opts->x_param_values,
12496 global_options_set.x_param_values);
12497 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12498 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12499 aarch64_tune_params.prefetch->l2_cache_size,
12500 opts->x_param_values,
12501 global_options_set.x_param_values);
12502 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12503 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12504 0,
12505 opts->x_param_values,
12506 global_options_set.x_param_values);
12507 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12508 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12509 aarch64_tune_params.prefetch->minimum_stride,
12510 opts->x_param_values,
12511 global_options_set.x_param_values);
12512
12513 /* Use the alternative scheduling-pressure algorithm by default. */
12514 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12515 opts->x_param_values,
12516 global_options_set.x_param_values);
12517
12518 /* If the user hasn't changed it via configure then set the default to 64 KB
12519 for the backend. */
12520 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12521 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12522 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12523 opts->x_param_values,
12524 global_options_set.x_param_values);
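/* The parameter is a power-of-two exponent, so the value 16 used above
   selects a guard of 2^16 bytes = 64 KB, matching the comment.  */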
12525
12526 /* Validate the guard size. */
12527 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12528
12529 /* Enforce that interval is the same size as size so the mid-end does the
12530 right thing. */
12531 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12532 guard_size,
12533 opts->x_param_values,
12534 global_options_set.x_param_values);
12535
12536 /* The maybe_set calls won't update the value if the user has explicitly set
12537 one, which means we need to validate that the probing interval and guard size
12538 are equal. */
12539 int probe_interval
12540 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12541 if (guard_size != probe_interval)
12542 error ("stack clash guard size %<%d%> must be equal to probing interval "
12543 "%<%d%>", guard_size, probe_interval);
12544
12545 /* Enable software prefetching at the specified optimization level for
12546 CPUs that have prefetch tuning parameters, unless optimizing for
12547 size. */
12548 if (opts->x_flag_prefetch_loop_arrays < 0
12549 && !opts->x_optimize_size
12550 && aarch64_tune_params.prefetch->default_opt_level >= 0
12551 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12552 opts->x_flag_prefetch_loop_arrays = 1;
12553
12554 if (opts->x_aarch64_arch_string == NULL)
12555 opts->x_aarch64_arch_string = selected_arch->name;
12556 if (opts->x_aarch64_cpu_string == NULL)
12557 opts->x_aarch64_cpu_string = selected_cpu->name;
12558 if (opts->x_aarch64_tune_string == NULL)
12559 opts->x_aarch64_tune_string = selected_tune->name;
12560
12561 aarch64_override_options_after_change_1 (opts);
12562 }
12563
12564 /* Print a hint with a suggestion for a core or architecture name that
12565 most closely resembles what the user passed in STR. ARCH is true if
12566 the user is asking for an architecture name. ARCH is false if the user
12567 is asking for a core name. */
12568
12569 static void
12570 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12571 {
12572 auto_vec<const char *> candidates;
12573 const struct processor *entry = arch ? all_architectures : all_cores;
12574 for (; entry->name != NULL; entry++)
12575 candidates.safe_push (entry->name);
12576
12577 #ifdef HAVE_LOCAL_CPU_DETECT
12578 /* Also add "native" as a possible value. */
12579 if (arch)
12580 candidates.safe_push ("native");
12581 #endif
12582
12583 char *s;
12584 const char *hint = candidates_list_and_hint (str, s, candidates);
12585 if (hint)
12586 inform (input_location, "valid arguments are: %s;"
12587 " did you mean %qs?", s, hint);
12588 else
12589 inform (input_location, "valid arguments are: %s", s);
12590
12591 XDELETEVEC (s);
12592 }
12593
12594 /* Print a hint with a suggestion for a core name that most closely resembles
12595 what the user passed in STR. */
12596
12597 inline static void
12598 aarch64_print_hint_for_core (const char *str)
12599 {
12600 aarch64_print_hint_for_core_or_arch (str, false);
12601 }
12602
12603 /* Print a hint with a suggestion for an architecture name that most closely
12604 resembles what the user passed in STR. */
12605
12606 inline static void
12607 aarch64_print_hint_for_arch (const char *str)
12608 {
12609 aarch64_print_hint_for_core_or_arch (str, true);
12610 }
12611
12612
12613 /* Print a hint with a suggestion for an extension name
12614 that most closely resembles what the user passed in STR. */
12615
12616 void
12617 aarch64_print_hint_for_extensions (const std::string &str)
12618 {
12619 auto_vec<const char *> candidates;
12620 aarch64_get_all_extension_candidates (&candidates);
12621 char *s;
12622 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12623 if (hint)
12624 inform (input_location, "valid arguments are: %s;"
12625 " did you mean %qs?", s, hint);
12626 else
12627 inform (input_location, "valid arguments are: %s", s);
12628
12629 XDELETEVEC (s);
12630 }
12631
12632 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12633 specified in STR and throw errors if appropriate. Put the results, if
12634 they are valid, in RES and ISA_FLAGS. Return whether the option is
12635 valid. */
12636
12637 static bool
12638 aarch64_validate_mcpu (const char *str, const struct processor **res,
12639 uint64_t *isa_flags)
12640 {
12641 std::string invalid_extension;
12642 enum aarch64_parse_opt_result parse_res
12643 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12644
12645 if (parse_res == AARCH64_PARSE_OK)
12646 return true;
12647
12648 switch (parse_res)
12649 {
12650 case AARCH64_PARSE_MISSING_ARG:
12651 error ("missing cpu name in %<-mcpu=%s%>", str);
12652 break;
12653 case AARCH64_PARSE_INVALID_ARG:
12654 error ("unknown value %qs for %<-mcpu%>", str);
12655 aarch64_print_hint_for_core (str);
12656 break;
12657 case AARCH64_PARSE_INVALID_FEATURE:
12658 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12659 invalid_extension.c_str (), str);
12660 aarch64_print_hint_for_extensions (invalid_extension);
12661 break;
12662 default:
12663 gcc_unreachable ();
12664 }
12665
12666 return false;
12667 }
12668
12669 /* Parses CONST_STR for branch protection features specified in
12670 aarch64_branch_protect_types, and sets any global variables required. Returns
12671 the parsing result and assigns LAST_STR to the last processed token from
12672 CONST_STR so that it can be used for error reporting. */
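/* Illustrative example: "pac-ret+leaf+bti" is split on '+' into "pac-ret",
   "leaf" and "bti"; "pac-ret" matches a top-level type, "leaf" is then
   consumed as one of its subtypes and "bti" matches a further top-level
   type.  An unrecognised token such as "foo" yields
   AARCH64_PARSE_INVALID_ARG with the offending token left in *LAST_STR.  */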
12673
12674 static enum
12675 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12676 char** last_str)
12677 {
12678 char *str_root = xstrdup (const_str);
12679 char* token_save = NULL;
12680 char *str = strtok_r (str_root, "+", &token_save);
12681 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12682 if (!str)
12683 res = AARCH64_PARSE_MISSING_ARG;
12684 else
12685 {
12686 char *next_str = strtok_r (NULL, "+", &token_save);
12687 /* Reset the branch protection features to their defaults. */
12688 aarch64_handle_no_branch_protection (NULL, NULL);
12689
12690 while (str && res == AARCH64_PARSE_OK)
12691 {
12692 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12693 bool found = false;
12694 /* Search for this type. */
12695 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12696 {
12697 if (strcmp (str, type->name) == 0)
12698 {
12699 found = true;
12700 res = type->handler (str, next_str);
12701 str = next_str;
12702 next_str = strtok_r (NULL, "+", &token_save);
12703 }
12704 else
12705 type++;
12706 }
12707 if (found && res == AARCH64_PARSE_OK)
12708 {
12709 bool found_subtype = true;
12710 /* Loop through each token until we find one that isn't a
12711 subtype. */
12712 while (found_subtype)
12713 {
12714 found_subtype = false;
12715 const aarch64_branch_protect_type *subtype = type->subtypes;
12716 /* Search for the subtype. */
12717 while (str && subtype && subtype->name && !found_subtype
12718 && res == AARCH64_PARSE_OK)
12719 {
12720 if (strcmp (str, subtype->name) == 0)
12721 {
12722 found_subtype = true;
12723 res = subtype->handler (str, next_str);
12724 str = next_str;
12725 next_str = strtok_r (NULL, "+", &token_save);
12726 }
12727 else
12728 subtype++;
12729 }
12730 }
12731 }
12732 else if (!found)
12733 res = AARCH64_PARSE_INVALID_ARG;
12734 }
12735 }
12736 /* Copy the last processed token into the argument to pass it back.
12737 Used by option and attribute validation to print the offending token. */
12738 if (last_str)
12739 {
12740 if (str) strcpy (*last_str, str);
12741 else *last_str = NULL;
12742 }
12743 if (res == AARCH64_PARSE_OK)
12744 {
12745 /* If needed, alloc the accepted string then copy in const_str.
12746 Used by aarch64_override_options_after_change_1. */
12747 if (!accepted_branch_protection_string)
12748 accepted_branch_protection_string = (char *) xmalloc (
12749 BRANCH_PROTECT_STR_MAX
12750 + 1);
12751 strncpy (accepted_branch_protection_string, const_str,
12752 BRANCH_PROTECT_STR_MAX + 1);
12753 /* Forcibly null-terminate. */
12754 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12755 }
12756 return res;
12757 }
12758
12759 static bool
12760 aarch64_validate_mbranch_protection (const char *const_str)
12761 {
12762 char *str = (char *) xmalloc (strlen (const_str) + 1);
12763 enum aarch64_parse_opt_result res =
12764 aarch64_parse_branch_protection (const_str, &str);
12765 if (res == AARCH64_PARSE_INVALID_ARG)
12766 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12767 else if (res == AARCH64_PARSE_MISSING_ARG)
12768 error ("missing argument for %<-mbranch-protection=%>");
12769 free (str);
12770 return res == AARCH64_PARSE_OK;
12771 }
12772
12773 /* Validate a command-line -march option. Parse the arch and extensions
12774 (if any) specified in STR and throw errors if appropriate. Put the
12775 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12776 option is valid. */
12777
12778 static bool
12779 aarch64_validate_march (const char *str, const struct processor **res,
12780 uint64_t *isa_flags)
12781 {
12782 std::string invalid_extension;
12783 enum aarch64_parse_opt_result parse_res
12784 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12785
12786 if (parse_res == AARCH64_PARSE_OK)
12787 return true;
12788
12789 switch (parse_res)
12790 {
12791 case AARCH64_PARSE_MISSING_ARG:
12792 error ("missing arch name in %<-march=%s%>", str);
12793 break;
12794 case AARCH64_PARSE_INVALID_ARG:
12795 error ("unknown value %qs for %<-march%>", str);
12796 aarch64_print_hint_for_arch (str);
12797 break;
12798 case AARCH64_PARSE_INVALID_FEATURE:
12799 error ("invalid feature modifier %qs in %<-march=%s%>",
12800 invalid_extension.c_str (), str);
12801 aarch64_print_hint_for_extensions (invalid_extension);
12802 break;
12803 default:
12804 gcc_unreachable ();
12805 }
12806
12807 return false;
12808 }
12809
12810 /* Validate a command-line -mtune option. Parse the cpu
12811 specified in STR and throw errors if appropriate. Put the
12812 result, if it is valid, in RES. Return whether the option is
12813 valid. */
12814
12815 static bool
12816 aarch64_validate_mtune (const char *str, const struct processor **res)
12817 {
12818 enum aarch64_parse_opt_result parse_res
12819 = aarch64_parse_tune (str, res);
12820
12821 if (parse_res == AARCH64_PARSE_OK)
12822 return true;
12823
12824 switch (parse_res)
12825 {
12826 case AARCH64_PARSE_MISSING_ARG:
12827 error ("missing cpu name in %<-mtune=%s%>", str);
12828 break;
12829 case AARCH64_PARSE_INVALID_ARG:
12830 error ("unknown value %qs for %<-mtune%>", str);
12831 aarch64_print_hint_for_core (str);
12832 break;
12833 default:
12834 gcc_unreachable ();
12835 }
12836 return false;
12837 }
12838
12839 /* Return the CPU corresponding to the enum CPU.
12840 If it doesn't specify a cpu, return the default. */
12841
12842 static const struct processor *
12843 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12844 {
12845 if (cpu != aarch64_none)
12846 return &all_cores[cpu];
12847
12848 /* The & 0x3f is to extract the bottom 6 bits that encode the
12849 default cpu as selected by the --with-cpu GCC configure option
12850 in config.gcc.
12851 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12852 flags mechanism should be reworked to make it more sane. */
12853 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12854 }
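/* Packing assumed here and in aarch64_override_options: bits [5:0] of
   TARGET_CPU_DEFAULT hold the aarch64_processor value of the
   configure-time CPU, while the remaining upper bits (extracted there
   with ">> 6") hold that CPU's default ISA flags.  */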
12855
12856 /* Return the architecture corresponding to the enum ARCH.
12857 If it doesn't specify a valid architecture, return the default. */
12858
12859 static const struct processor *
12860 aarch64_get_arch (enum aarch64_arch arch)
12861 {
12862 if (arch != aarch64_no_arch)
12863 return &all_architectures[arch];
12864
12865 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12866
12867 return &all_architectures[cpu->arch];
12868 }
12869
12870 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12871
12872 static poly_uint16
12873 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12874 {
12875 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12876 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12877 deciding which .md file patterns to use and when deciding whether
12878 something is a legitimate address or constant. */
12879 if (value == SVE_SCALABLE || value == SVE_128)
12880 return poly_uint16 (2, 2);
12881 else
12882 return (int) value / 64;
12883 }
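/* Worked example: -msve-vector-bits=256 gives 256 / 64 = 4, i.e. a fixed
   vector of four 64-bit granules, whereas -msve-vector-bits=scalable (and,
   per the comment above, =128) gives the runtime-variable
   poly_uint16 (2, 2).  */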
12884
12885 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12886 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12887 tuning structs. In particular it must set selected_tune and
12888 aarch64_isa_flags that define the available ISA features and tuning
12889 decisions. It must also set selected_arch as this will be used to
12890 output the .arch asm tags for each function. */
12891
12892 static void
12893 aarch64_override_options (void)
12894 {
12895 uint64_t cpu_isa = 0;
12896 uint64_t arch_isa = 0;
12897 aarch64_isa_flags = 0;
12898
12899 bool valid_cpu = true;
12900 bool valid_tune = true;
12901 bool valid_arch = true;
12902
12903 selected_cpu = NULL;
12904 selected_arch = NULL;
12905 selected_tune = NULL;
12906
12907 if (aarch64_branch_protection_string)
12908 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12909
12910 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12911 If either of -march or -mtune is given, they override their
12912 respective component of -mcpu. */
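/* For example, -mcpu=cortex-a57 behaves like -march=armv8-a
   -mtune=cortex-a57, and adding an explicit -march=armv8.2-a or
   -mtune=cortex-a72 on the command line replaces the corresponding half
   of that expansion.  */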
12913 if (aarch64_cpu_string)
12914 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12915 &cpu_isa);
12916
12917 if (aarch64_arch_string)
12918 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12919 &arch_isa);
12920
12921 if (aarch64_tune_string)
12922 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12923
12924 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12925 SUBTARGET_OVERRIDE_OPTIONS;
12926 #endif
12927
12928 /* If the user did not specify a processor, choose the default
12929 one for them. This will be the CPU set during configuration using
12930 --with-cpu, otherwise it is "generic". */
12931 if (!selected_cpu)
12932 {
12933 if (selected_arch)
12934 {
12935 selected_cpu = &all_cores[selected_arch->ident];
12936 aarch64_isa_flags = arch_isa;
12937 explicit_arch = selected_arch->arch;
12938 }
12939 else
12940 {
12941 /* Get default configure-time CPU. */
12942 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12943 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12944 }
12945
12946 if (selected_tune)
12947 explicit_tune_core = selected_tune->ident;
12948 }
12949 /* If both -mcpu and -march are specified check that they are architecturally
12950 compatible, warn if they're not and prefer the -march ISA flags. */
12951 else if (selected_arch)
12952 {
12953 if (selected_arch->arch != selected_cpu->arch)
12954 {
12955 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12956 all_architectures[selected_cpu->arch].name,
12957 selected_arch->name);
12958 }
12959 aarch64_isa_flags = arch_isa;
12960 explicit_arch = selected_arch->arch;
12961 explicit_tune_core = selected_tune ? selected_tune->ident
12962 : selected_cpu->ident;
12963 }
12964 else
12965 {
12966 /* -mcpu but no -march. */
12967 aarch64_isa_flags = cpu_isa;
12968 explicit_tune_core = selected_tune ? selected_tune->ident
12969 : selected_cpu->ident;
12970 gcc_assert (selected_cpu);
12971 selected_arch = &all_architectures[selected_cpu->arch];
12972 explicit_arch = selected_arch->arch;
12973 }
12974
12975 /* Set the arch as well, as we will need it when outputting
12976 the .arch directive in assembly. */
12977 if (!selected_arch)
12978 {
12979 gcc_assert (selected_cpu);
12980 selected_arch = &all_architectures[selected_cpu->arch];
12981 }
12982
12983 if (!selected_tune)
12984 selected_tune = selected_cpu;
12985
12986 if (aarch64_enable_bti == 2)
12987 {
12988 #ifdef TARGET_ENABLE_BTI
12989 aarch64_enable_bti = 1;
12990 #else
12991 aarch64_enable_bti = 0;
12992 #endif
12993 }
12994
12995 /* Return address signing is currently not supported for ILP32 targets. For
12996 LP64 targets use the configured option in the absence of a command-line
12997 option for -mbranch-protection. */
12998 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12999 {
13000 #ifdef TARGET_ENABLE_PAC_RET
13001 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13002 #else
13003 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13004 #endif
13005 }
13006
13007 #ifndef HAVE_AS_MABI_OPTION
13008 /* The compiler may have been configured with 2.23.* binutils, which does
13009 not have support for ILP32. */
13010 if (TARGET_ILP32)
13011 error ("assembler does not support %<-mabi=ilp32%>");
13012 #endif
13013
13014 /* Convert -msve-vector-bits to a VG count. */
13015 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13016
13017 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13018 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13019
13020 /* Make sure we properly set up the explicit options. */
13021 if ((aarch64_cpu_string && valid_cpu)
13022 || (aarch64_tune_string && valid_tune))
13023 gcc_assert (explicit_tune_core != aarch64_none);
13024
13025 if ((aarch64_cpu_string && valid_cpu)
13026 || (aarch64_arch_string && valid_arch))
13027 gcc_assert (explicit_arch != aarch64_no_arch);
13028
13029 /* The pass to insert speculation tracking runs before
13030 shrink-wrapping and the latter does not know how to update the
13031 tracking status. So disable it in this case. */
13032 if (aarch64_track_speculation)
13033 flag_shrink_wrap = 0;
13034
13035 aarch64_override_options_internal (&global_options);
13036
13037 /* Save these options as the default ones in case we push and pop them later
13038 while processing functions with potential target attributes. */
13039 target_option_default_node = target_option_current_node
13040 = build_target_option_node (&global_options);
13041 }
13042
13043 /* Implement targetm.override_options_after_change. */
13044
13045 static void
13046 aarch64_override_options_after_change (void)
13047 {
13048 aarch64_override_options_after_change_1 (&global_options);
13049 }
13050
13051 static struct machine_function *
13052 aarch64_init_machine_status (void)
13053 {
13054 struct machine_function *machine;
13055 machine = ggc_cleared_alloc<machine_function> ();
13056 return machine;
13057 }
13058
13059 void
13060 aarch64_init_expanders (void)
13061 {
13062 init_machine_status = aarch64_init_machine_status;
13063 }
13064
13065 /* Work out the code model to use from OPTS, adjusting for -fpic/-fPIC and diagnosing unsupported combinations. */
13066 static void
13067 initialize_aarch64_code_model (struct gcc_options *opts)
13068 {
13069 if (opts->x_flag_pic)
13070 {
13071 switch (opts->x_aarch64_cmodel_var)
13072 {
13073 case AARCH64_CMODEL_TINY:
13074 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13075 break;
13076 case AARCH64_CMODEL_SMALL:
13077 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13078 aarch64_cmodel = (flag_pic == 2
13079 ? AARCH64_CMODEL_SMALL_PIC
13080 : AARCH64_CMODEL_SMALL_SPIC);
13081 #else
13082 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13083 #endif
13084 break;
13085 case AARCH64_CMODEL_LARGE:
13086 sorry ("code model %qs with %<-f%s%>", "large",
13087 opts->x_flag_pic > 1 ? "PIC" : "pic");
13088 break;
13089 default:
13090 gcc_unreachable ();
13091 }
13092 }
13093 else
13094 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13095 }
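/* Sketch of the resulting mapping: with the default -mcmodel=small,
   -fPIC (flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC while -fpic
   selects AARCH64_CMODEL_SMALL_SPIC when the assembler has the small GOT
   relocations; -mcmodel=tiny becomes TINY_PIC under either flag, and
   -mcmodel=large with PIC is rejected via sorry ().  */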
13096
13097 /* Implement TARGET_OPTION_SAVE. */
13098
13099 static void
13100 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13101 {
13102 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13103 ptr->x_aarch64_branch_protection_string
13104 = opts->x_aarch64_branch_protection_string;
13105 }
13106
13107 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13108 using the information saved in PTR. */
13109
13110 static void
13111 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13112 {
13113 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13114 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13115 opts->x_explicit_arch = ptr->x_explicit_arch;
13116 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13117 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13118 opts->x_aarch64_branch_protection_string
13119 = ptr->x_aarch64_branch_protection_string;
13120 if (opts->x_aarch64_branch_protection_string)
13121 {
13122 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13123 NULL);
13124 }
13125
13126 aarch64_override_options_internal (opts);
13127 }
13128
13129 /* Implement TARGET_OPTION_PRINT. */
13130
13131 static void
13132 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13133 {
13134 const struct processor *cpu
13135 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13136 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13137 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13138 std::string extension
13139 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13140
13141 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13142 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13143 arch->name, extension.c_str ());
13144 }
13145
13146 static GTY(()) tree aarch64_previous_fndecl;
13147
13148 void
13149 aarch64_reset_previous_fndecl (void)
13150 {
13151 aarch64_previous_fndecl = NULL;
13152 }
13153
13154 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13155 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13156 make sure optab availability predicates are recomputed when necessary. */
13157
13158 void
13159 aarch64_save_restore_target_globals (tree new_tree)
13160 {
13161 if (TREE_TARGET_GLOBALS (new_tree))
13162 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13163 else if (new_tree == target_option_default_node)
13164 restore_target_globals (&default_target_globals);
13165 else
13166 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13167 }
13168
13169 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13170 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13171 of the function, if such exists. This function may be called multiple
13172 times on a single function so use aarch64_previous_fndecl to avoid
13173 setting up identical state. */
13174
13175 static void
13176 aarch64_set_current_function (tree fndecl)
13177 {
13178 if (!fndecl || fndecl == aarch64_previous_fndecl)
13179 return;
13180
13181 tree old_tree = (aarch64_previous_fndecl
13182 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13183 : NULL_TREE);
13184
13185 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13186
13187 /* If the current function has no attributes but the previous one did,
13188 use the default node. */
13189 if (!new_tree && old_tree)
13190 new_tree = target_option_default_node;
13191
13192 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13193 the default have been handled by aarch64_save_restore_target_globals from
13194 aarch64_pragma_target_parse. */
13195 if (old_tree == new_tree)
13196 return;
13197
13198 aarch64_previous_fndecl = fndecl;
13199
13200 /* First set the target options. */
13201 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13202
13203 aarch64_save_restore_target_globals (new_tree);
13204 }
13205
13206 /* Enum describing the various ways we can handle attributes.
13207 In many cases we can reuse the generic option handling machinery. */
13208
13209 enum aarch64_attr_opt_type
13210 {
13211 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13212 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13213 aarch64_attr_enum, /* Attribute sets an enum variable. */
13214 aarch64_attr_custom /* Attribute requires a custom handling function. */
13215 };
13216
13217 /* All the information needed to handle a target attribute.
13218 NAME is the name of the attribute.
13219 ATTR_TYPE specifies the type of behavior of the attribute as described
13220 in the definition of enum aarch64_attr_opt_type.
13221 ALLOW_NEG is true if the attribute supports a "no-" form.
13222 HANDLER is the function that takes the attribute string as an argument.
13223 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13224 OPT_NUM is the enum specifying the option that the attribute modifies.
13225 This is needed for attributes that mirror the behavior of a command-line
13226 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13227 aarch64_attr_enum. */
13228
13229 struct aarch64_attribute_info
13230 {
13231 const char *name;
13232 enum aarch64_attr_opt_type attr_type;
13233 bool allow_neg;
13234 bool (*handler) (const char *);
13235 enum opt_code opt_num;
13236 };
13237
13238 /* Handle the ARCH_STR argument to the arch= target attribute. */
13239
13240 static bool
13241 aarch64_handle_attr_arch (const char *str)
13242 {
13243 const struct processor *tmp_arch = NULL;
13244 std::string invalid_extension;
13245 enum aarch64_parse_opt_result parse_res
13246 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13247
13248 if (parse_res == AARCH64_PARSE_OK)
13249 {
13250 gcc_assert (tmp_arch);
13251 selected_arch = tmp_arch;
13252 explicit_arch = selected_arch->arch;
13253 return true;
13254 }
13255
13256 switch (parse_res)
13257 {
13258 case AARCH64_PARSE_MISSING_ARG:
13259 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13260 break;
13261 case AARCH64_PARSE_INVALID_ARG:
13262 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13263 aarch64_print_hint_for_arch (str);
13264 break;
13265 case AARCH64_PARSE_INVALID_FEATURE:
13266 error ("invalid feature modifier %s of value (\"%s\") in "
13267 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13268 aarch64_print_hint_for_extensions (invalid_extension);
13269 break;
13270 default:
13271 gcc_unreachable ();
13272 }
13273
13274 return false;
13275 }
13276
13277 /* Handle the argument CPU_STR to the cpu= target attribute. */
13278
13279 static bool
13280 aarch64_handle_attr_cpu (const char *str)
13281 {
13282 const struct processor *tmp_cpu = NULL;
13283 std::string invalid_extension;
13284 enum aarch64_parse_opt_result parse_res
13285 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13286
13287 if (parse_res == AARCH64_PARSE_OK)
13288 {
13289 gcc_assert (tmp_cpu);
13290 selected_tune = tmp_cpu;
13291 explicit_tune_core = selected_tune->ident;
13292
13293 selected_arch = &all_architectures[tmp_cpu->arch];
13294 explicit_arch = selected_arch->arch;
13295 return true;
13296 }
13297
13298 switch (parse_res)
13299 {
13300 case AARCH64_PARSE_MISSING_ARG:
13301 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13302 break;
13303 case AARCH64_PARSE_INVALID_ARG:
13304 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13305 aarch64_print_hint_for_core (str);
13306 break;
13307 case AARCH64_PARSE_INVALID_FEATURE:
13308 error ("invalid feature modifier %s of value (\"%s\") in "
13309 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13310 aarch64_print_hint_for_extensions (invalid_extension);
13311 break;
13312 default:
13313 gcc_unreachable ();
13314 }
13315
13316 return false;
13317 }
13318
13319 /* Handle the argument STR to the branch-protection= attribute. */
13320
13321 static bool
13322 aarch64_handle_attr_branch_protection (const char* str)
13323 {
13324 char *err_str = (char *) xmalloc (strlen (str) + 1);
13325 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13326 &err_str);
13327 bool success = false;
13328 switch (res)
13329 {
13330 case AARCH64_PARSE_MISSING_ARG:
13331 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13332 " attribute");
13333 break;
13334 case AARCH64_PARSE_INVALID_ARG:
13335 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13336 "=\")%> pragma or attribute", err_str);
13337 break;
13338 case AARCH64_PARSE_OK:
13339 success = true;
13340 /* Fall through. */
13341 case AARCH64_PARSE_INVALID_FEATURE:
13342 break;
13343 default:
13344 gcc_unreachable ();
13345 }
13346 free (err_str);
13347 return success;
13348 }
13349
13350 /* Handle the argument STR to the tune= target attribute. */
13351
13352 static bool
13353 aarch64_handle_attr_tune (const char *str)
13354 {
13355 const struct processor *tmp_tune = NULL;
13356 enum aarch64_parse_opt_result parse_res
13357 = aarch64_parse_tune (str, &tmp_tune);
13358
13359 if (parse_res == AARCH64_PARSE_OK)
13360 {
13361 gcc_assert (tmp_tune);
13362 selected_tune = tmp_tune;
13363 explicit_tune_core = selected_tune->ident;
13364 return true;
13365 }
13366
13367 switch (parse_res)
13368 {
13369 case AARCH64_PARSE_INVALID_ARG:
13370 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13371 aarch64_print_hint_for_core (str);
13372 break;
13373 default:
13374 gcc_unreachable ();
13375 }
13376
13377 return false;
13378 }
13379
13380 /* Parse an architecture extensions target attribute string specified in STR.
13381 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13382 if successful. Update aarch64_isa_flags to reflect the ISA features
13383 modified. */
13384
13385 static bool
13386 aarch64_handle_attr_isa_flags (char *str)
13387 {
13388 enum aarch64_parse_opt_result parse_res;
13389 uint64_t isa_flags = aarch64_isa_flags;
13390
13391 /* We allow "+nothing" in the beginning to clear out all architectural
13392 features if the user wants to handpick specific features. */
13393 if (strncmp ("+nothing", str, 8) == 0)
13394 {
13395 isa_flags = 0;
13396 str += 8;
13397 }
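/* For example (illustrative), __attribute__ ((target ("+nothing+simd")))
   discards the command-line ISA flags here and then re-enables only the
   features implied by "+simd" via aarch64_parse_extension below.  */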
13398
13399 std::string invalid_extension;
13400 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13401
13402 if (parse_res == AARCH64_PARSE_OK)
13403 {
13404 aarch64_isa_flags = isa_flags;
13405 return true;
13406 }
13407
13408 switch (parse_res)
13409 {
13410 case AARCH64_PARSE_MISSING_ARG:
13411 error ("missing value in %<target()%> pragma or attribute");
13412 break;
13413
13414 case AARCH64_PARSE_INVALID_FEATURE:
13415 error ("invalid feature modifier %s of value (\"%s\") in "
13416 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13417 break;
13418
13419 default:
13420 gcc_unreachable ();
13421 }
13422
13423 return false;
13424 }
13425
13426 /* The target attributes that we support. On top of these we also support just
13427 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13428 handled explicitly in aarch64_process_one_target_attr. */
13429
13430 static const struct aarch64_attribute_info aarch64_attributes[] =
13431 {
13432 { "general-regs-only", aarch64_attr_mask, false, NULL,
13433 OPT_mgeneral_regs_only },
13434 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13435 OPT_mfix_cortex_a53_835769 },
13436 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13437 OPT_mfix_cortex_a53_843419 },
13438 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13439 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13440 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13441 OPT_momit_leaf_frame_pointer },
13442 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13443 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13444 OPT_march_ },
13445 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13446 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13447 OPT_mtune_ },
13448 { "branch-protection", aarch64_attr_custom, false,
13449 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13450 { "sign-return-address", aarch64_attr_enum, false, NULL,
13451 OPT_msign_return_address_ },
13452 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13453 };
13454
13455 /* Parse ARG_STR which contains the definition of one target attribute.
13456 Show appropriate errors if any or return true if the attribute is valid. */
13457
13458 static bool
13459 aarch64_process_one_target_attr (char *arg_str)
13460 {
13461 bool invert = false;
13462
13463 size_t len = strlen (arg_str);
13464
13465 if (len == 0)
13466 {
13467 error ("malformed %<target()%> pragma or attribute");
13468 return false;
13469 }
13470
13471 char *str_to_check = (char *) alloca (len + 1);
13472 strcpy (str_to_check, arg_str);
13473
13474 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13475 It is easier to detect and handle it explicitly here rather than going
13476 through the machinery for the rest of the target attributes in this
13477 function. */
13478 if (*str_to_check == '+')
13479 return aarch64_handle_attr_isa_flags (str_to_check);
13480
13481 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13482 {
13483 invert = true;
13484 str_to_check += 3;
13485 }
13486 char *arg = strchr (str_to_check, '=');
13487
13488 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13489 and point ARG to "foo". */
13490 if (arg)
13491 {
13492 *arg = '\0';
13493 arg++;
13494 }
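/* To illustrate: "no-strict-align" sets INVERT and leaves "strict-align"
   to be looked up below, while "cmodel=small" is split at the '=' so that
   STR_TO_CHECK is "cmodel" and ARG points at "small".  */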
13495 const struct aarch64_attribute_info *p_attr;
13496 bool found = false;
13497 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13498 {
13499 /* If the names don't match up, or the user has given an argument
13500 to an attribute that doesn't accept one, or didn't give an argument
13501 to an attribute that expects one, fail to match. */
13502 if (strcmp (str_to_check, p_attr->name) != 0)
13503 continue;
13504
13505 found = true;
13506 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13507 || p_attr->attr_type == aarch64_attr_enum;
13508
13509 if (attr_need_arg_p ^ (arg != NULL))
13510 {
13511 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13512 return false;
13513 }
13514
13515 /* If the name matches but the attribute does not allow "no-" versions
13516 then we can't match. */
13517 if (invert && !p_attr->allow_neg)
13518 {
13519 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13520 return false;
13521 }
13522
13523 switch (p_attr->attr_type)
13524 {
13525 /* Has a custom handler registered.
13526 For example, cpu=, arch=, tune=. */
13527 case aarch64_attr_custom:
13528 gcc_assert (p_attr->handler);
13529 if (!p_attr->handler (arg))
13530 return false;
13531 break;
13532
13533 /* Either set or unset a boolean option. */
13534 case aarch64_attr_bool:
13535 {
13536 struct cl_decoded_option decoded;
13537
13538 generate_option (p_attr->opt_num, NULL, !invert,
13539 CL_TARGET, &decoded);
13540 aarch64_handle_option (&global_options, &global_options_set,
13541 &decoded, input_location);
13542 break;
13543 }
13544 /* Set or unset a bit in the target_flags. aarch64_handle_option
13545 should know what mask to apply given the option number. */
13546 case aarch64_attr_mask:
13547 {
13548 struct cl_decoded_option decoded;
13549 /* We only need to specify the option number.
13550 aarch64_handle_option will know which mask to apply. */
13551 decoded.opt_index = p_attr->opt_num;
13552 decoded.value = !invert;
13553 aarch64_handle_option (&global_options, &global_options_set,
13554 &decoded, input_location);
13555 break;
13556 }
13557 /* Use the option setting machinery to set an option to an enum. */
13558 case aarch64_attr_enum:
13559 {
13560 gcc_assert (arg);
13561 bool valid;
13562 int value;
13563 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13564 &value, CL_TARGET);
13565 if (valid)
13566 {
13567 set_option (&global_options, NULL, p_attr->opt_num, value,
13568 NULL, DK_UNSPECIFIED, input_location,
13569 global_dc);
13570 }
13571 else
13572 {
13573 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13574 }
13575 break;
13576 }
13577 default:
13578 gcc_unreachable ();
13579 }
13580 }
13581
13582 /* If we reached here we either have found an attribute and validated
13583 it or didn't match any. If we matched an attribute but its arguments
13584 were malformed we will have returned false already. */
13585 return found;
13586 }
13587
13588 /* Count how many times the character C appears in
13589 NULL-terminated string STR. */
13590
13591 static unsigned int
13592 num_occurences_in_str (char c, char *str)
13593 {
13594 unsigned int res = 0;
13595 while (*str != '\0')
13596 {
13597 if (*str == c)
13598 res++;
13599
13600 str++;
13601 }
13602
13603 return res;
13604 }
13605
13606 /* Parse the tree in ARGS that contains the target attribute information
13607 and update the global target options space. */
13608
13609 bool
13610 aarch64_process_target_attr (tree args)
13611 {
13612 if (TREE_CODE (args) == TREE_LIST)
13613 {
13614 do
13615 {
13616 tree head = TREE_VALUE (args);
13617 if (head)
13618 {
13619 if (!aarch64_process_target_attr (head))
13620 return false;
13621 }
13622 args = TREE_CHAIN (args);
13623 } while (args);
13624
13625 return true;
13626 }
13627
13628 if (TREE_CODE (args) != STRING_CST)
13629 {
13630 error ("attribute %<target%> argument not a string");
13631 return false;
13632 }
13633
13634 size_t len = strlen (TREE_STRING_POINTER (args));
13635 char *str_to_check = (char *) alloca (len + 1);
13636 strcpy (str_to_check, TREE_STRING_POINTER (args));
13637
13638 if (len == 0)
13639 {
13640 error ("malformed %<target()%> pragma or attribute");
13641 return false;
13642 }
13643
13644 /* Used to catch empty tokens between commas, i.e.
13645 attribute ((target ("attr1,,attr2"))). */
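/* strtok_r treats consecutive separators as one, so "attr1,,attr2"
   produces only two tokens despite containing two commas; the
   num_attrs != num_commas + 1 check further down catches exactly this
   case.  */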
13646 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13647
13648 /* Handle multiple target attributes separated by ','. */
13649 char *token = strtok_r (str_to_check, ",", &str_to_check);
13650
13651 unsigned int num_attrs = 0;
13652 while (token)
13653 {
13654 num_attrs++;
13655 if (!aarch64_process_one_target_attr (token))
13656 {
13657 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13658 return false;
13659 }
13660
13661 token = strtok_r (NULL, ",", &str_to_check);
13662 }
13663
13664 if (num_attrs != num_commas + 1)
13665 {
13666 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13667 return false;
13668 }
13669
13670 return true;
13671 }
13672
13673 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13674 process attribute ((target ("..."))). */
13675
13676 static bool
13677 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13678 {
13679 struct cl_target_option cur_target;
13680 bool ret;
13681 tree old_optimize;
13682 tree new_target, new_optimize;
13683 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13684
13685 /* If what we're processing is the current pragma string then the
13686 target option node is already stored in target_option_current_node
13687 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13688 having to re-parse the string. This is especially useful to keep
13689 arm_neon.h compile times down since that header contains a lot
13690 of intrinsics enclosed in pragmas. */
13691 if (!existing_target && args == current_target_pragma)
13692 {
13693 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13694 return true;
13695 }
13696 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13697
13698 old_optimize = build_optimization_node (&global_options);
13699 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13700
13701 /* If the function changed the optimization levels as well as setting
13702 target options, start with the optimizations specified. */
13703 if (func_optimize && func_optimize != old_optimize)
13704 cl_optimization_restore (&global_options,
13705 TREE_OPTIMIZATION (func_optimize));
13706
13707 /* Save the current target options to restore at the end. */
13708 cl_target_option_save (&cur_target, &global_options);
13709
13710 /* If fndecl already has some target attributes applied to it, unpack
13711 them so that we add this attribute on top of them, rather than
13712 overwriting them. */
13713 if (existing_target)
13714 {
13715 struct cl_target_option *existing_options
13716 = TREE_TARGET_OPTION (existing_target);
13717
13718 if (existing_options)
13719 cl_target_option_restore (&global_options, existing_options);
13720 }
13721 else
13722 cl_target_option_restore (&global_options,
13723 TREE_TARGET_OPTION (target_option_current_node));
13724
13725 ret = aarch64_process_target_attr (args);
13726
13727 /* Set up any additional state. */
13728 if (ret)
13729 {
13730 aarch64_override_options_internal (&global_options);
13731 /* Initialize SIMD builtins if we haven't already.
13732 Set current_target_pragma to NULL for the duration so that
13733 the builtin initialization code doesn't try to tag the functions
13734 being built with the attributes specified by any current pragma, thus
13735 going into an infinite recursion. */
13736 if (TARGET_SIMD)
13737 {
13738 tree saved_current_target_pragma = current_target_pragma;
13739 current_target_pragma = NULL;
13740 aarch64_init_simd_builtins ();
13741 current_target_pragma = saved_current_target_pragma;
13742 }
13743 new_target = build_target_option_node (&global_options);
13744 }
13745 else
13746 new_target = NULL;
13747
13748 new_optimize = build_optimization_node (&global_options);
13749
13750 if (fndecl && ret)
13751 {
13752 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13753
13754 if (old_optimize != new_optimize)
13755 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13756 }
13757
13758 cl_target_option_restore (&global_options, &cur_target);
13759
13760 if (old_optimize != new_optimize)
13761 cl_optimization_restore (&global_options,
13762 TREE_OPTIMIZATION (old_optimize));
13763 return ret;
13764 }
13765
13766 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13767 tri-bool options (yes, no, don't care) and the default value is
13768 DEF, determine whether inlining should be allowed. */
13769
13770 static bool
13771 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13772 int dont_care, int def)
13773 {
13774 /* If the callee doesn't care, always allow inlining. */
13775 if (callee == dont_care)
13776 return true;
13777
13778 /* If the caller doesn't care, always allow inlining. */
13779 if (caller == dont_care)
13780 return true;
13781
13782 /* Otherwise, allow inlining if the callee and caller values
13783 agree, or if the callee is using the default value. */
13784 return (callee == caller || callee == def);
13785 }
13786
13787 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13788 to inline CALLEE into CALLER based on target-specific info.
13789 Make sure that the caller and callee have compatible architectural
13790 features. Then go through the other possible target attributes
13791 and see if they can block inlining. Try not to reject always_inline
13792 callees unless they are incompatible architecturally. */
13793
13794 static bool
13795 aarch64_can_inline_p (tree caller, tree callee)
13796 {
13797 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13798 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13799
13800 struct cl_target_option *caller_opts
13801 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13802 : target_option_default_node);
13803
13804 struct cl_target_option *callee_opts
13805 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13806 : target_option_default_node);
13807
13808 /* Callee's ISA flags should be a subset of the caller's. */
13809 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13810 != callee_opts->x_aarch64_isa_flags)
13811 return false;
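/* For example, a callee declared with __attribute__ ((target ("+sve")))
   cannot be inlined into a caller compiled with only +simd, since the
   callee's ISA bits are not a subset of the caller's.  */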
13812
13813 /* Allow non-strict aligned functions to be inlined into strict
13814 aligned ones. */
13815 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13816 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13817 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13818 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13819 return false;
13820
13821 bool always_inline = lookup_attribute ("always_inline",
13822 DECL_ATTRIBUTES (callee));
13823
13824 /* If the architectural features match up and the callee is always_inline
13825 then the other attributes don't matter. */
13826 if (always_inline)
13827 return true;
13828
13829 if (caller_opts->x_aarch64_cmodel_var
13830 != callee_opts->x_aarch64_cmodel_var)
13831 return false;
13832
13833 if (caller_opts->x_aarch64_tls_dialect
13834 != callee_opts->x_aarch64_tls_dialect)
13835 return false;
13836
13837 /* Honour explicit requests to workaround errata. */
13838 if (!aarch64_tribools_ok_for_inlining_p (
13839 caller_opts->x_aarch64_fix_a53_err835769,
13840 callee_opts->x_aarch64_fix_a53_err835769,
13841 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13842 return false;
13843
13844 if (!aarch64_tribools_ok_for_inlining_p (
13845 caller_opts->x_aarch64_fix_a53_err843419,
13846 callee_opts->x_aarch64_fix_a53_err843419,
13847 2, TARGET_FIX_ERR_A53_843419))
13848 return false;
13849
13850 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13851 caller and callee and they don't match up, reject inlining. */
13852 if (!aarch64_tribools_ok_for_inlining_p (
13853 caller_opts->x_flag_omit_leaf_frame_pointer,
13854 callee_opts->x_flag_omit_leaf_frame_pointer,
13855 2, 1))
13856 return false;
13857
13858 /* If the callee has specific tuning overrides, respect them. */
13859 if (callee_opts->x_aarch64_override_tune_string != NULL
13860 && caller_opts->x_aarch64_override_tune_string == NULL)
13861 return false;
13862
13863 /* If the user specified tuning override strings for the
13864 caller and callee and they don't match up, reject inlining.
13865 We just do a string compare here, we don't analyze the meaning
13866 of the string, as it would be too costly for little gain. */
13867 if (callee_opts->x_aarch64_override_tune_string
13868 && caller_opts->x_aarch64_override_tune_string
13869 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13870 caller_opts->x_aarch64_override_tune_string) != 0))
13871 return false;
13872
13873 return true;
13874 }
13875
13876 /* Return true if SYMBOL_REF X binds locally. */
13877
13878 static bool
13879 aarch64_symbol_binds_local_p (const_rtx x)
13880 {
13881 return (SYMBOL_REF_DECL (x)
13882 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13883 : SYMBOL_REF_LOCAL_P (x));
13884 }
13885
13886 /* Return true if SYMBOL_REF X is thread local.  */
13887 static bool
13888 aarch64_tls_symbol_p (rtx x)
13889 {
13890 if (! TARGET_HAVE_TLS)
13891 return false;
13892
13893 if (GET_CODE (x) != SYMBOL_REF)
13894 return false;
13895
13896 return SYMBOL_REF_TLS_MODEL (x) != 0;
13897 }
13898
13899 /* Classify a TLS symbol into one of the TLS kinds. */
13900 enum aarch64_symbol_type
13901 aarch64_classify_tls_symbol (rtx x)
13902 {
13903 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13904
13905 switch (tls_kind)
13906 {
13907 case TLS_MODEL_GLOBAL_DYNAMIC:
13908 case TLS_MODEL_LOCAL_DYNAMIC:
13909 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13910
13911 case TLS_MODEL_INITIAL_EXEC:
13912 switch (aarch64_cmodel)
13913 {
13914 case AARCH64_CMODEL_TINY:
13915 case AARCH64_CMODEL_TINY_PIC:
13916 return SYMBOL_TINY_TLSIE;
13917 default:
13918 return SYMBOL_SMALL_TLSIE;
13919 }
13920
13921 case TLS_MODEL_LOCAL_EXEC:
13922 if (aarch64_tls_size == 12)
13923 return SYMBOL_TLSLE12;
13924 else if (aarch64_tls_size == 24)
13925 return SYMBOL_TLSLE24;
13926 else if (aarch64_tls_size == 32)
13927 return SYMBOL_TLSLE32;
13928 else if (aarch64_tls_size == 48)
13929 return SYMBOL_TLSLE48;
13930 else
13931 gcc_unreachable ();
13932
13933 case TLS_MODEL_EMULATED:
13934 case TLS_MODEL_NONE:
13935 return SYMBOL_FORCE_TO_MEM;
13936
13937 default:
13938 gcc_unreachable ();
13939 }
13940 }
13941
13942 /* Return the correct method for accessing X + OFFSET, where X is either
13943 a SYMBOL_REF or LABEL_REF. */
13944
13945 enum aarch64_symbol_type
13946 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13947 {
13948 if (GET_CODE (x) == LABEL_REF)
13949 {
13950 switch (aarch64_cmodel)
13951 {
13952 case AARCH64_CMODEL_LARGE:
13953 return SYMBOL_FORCE_TO_MEM;
13954
13955 case AARCH64_CMODEL_TINY_PIC:
13956 case AARCH64_CMODEL_TINY:
13957 return SYMBOL_TINY_ABSOLUTE;
13958
13959 case AARCH64_CMODEL_SMALL_SPIC:
13960 case AARCH64_CMODEL_SMALL_PIC:
13961 case AARCH64_CMODEL_SMALL:
13962 return SYMBOL_SMALL_ABSOLUTE;
13963
13964 default:
13965 gcc_unreachable ();
13966 }
13967 }
13968
13969 if (GET_CODE (x) == SYMBOL_REF)
13970 {
13971 if (aarch64_tls_symbol_p (x))
13972 return aarch64_classify_tls_symbol (x);
13973
13974 switch (aarch64_cmodel)
13975 {
13976 case AARCH64_CMODEL_TINY:
13977 /* When we retrieve symbol + offset address, we have to make sure
13978 the offset does not cause overflow of the final address. But
13979 we have no way of knowing the address of symbol at compile time
13980 so we can't accurately say if the distance between the PC and
13981 symbol + offset is outside the addressible range of +/-1M in the
13982 TINY code model. So we rely on images not being greater than
13983 1M and cap the offset at 1M and anything beyond 1M will have to
13984 be loaded using an alternative mechanism. Furthermore if the
13985 symbol is a weak reference to something that isn't known to
13986 resolve to a symbol in this module, then force to memory. */
13987 if ((SYMBOL_REF_WEAK (x)
13988 && !aarch64_symbol_binds_local_p (x))
13989 || !IN_RANGE (offset, -1048575, 1048575))
13990 return SYMBOL_FORCE_TO_MEM;
13991 return SYMBOL_TINY_ABSOLUTE;
13992
13993 case AARCH64_CMODEL_SMALL:
13994 /* Same reasoning as the tiny code model, but the offset cap here is
13995 4G. */
13996 if ((SYMBOL_REF_WEAK (x)
13997 && !aarch64_symbol_binds_local_p (x))
13998 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13999 HOST_WIDE_INT_C (4294967264)))
14000 return SYMBOL_FORCE_TO_MEM;
14001 return SYMBOL_SMALL_ABSOLUTE;
14002
14003 case AARCH64_CMODEL_TINY_PIC:
14004 if (!aarch64_symbol_binds_local_p (x))
14005 return SYMBOL_TINY_GOT;
14006 return SYMBOL_TINY_ABSOLUTE;
14007
14008 case AARCH64_CMODEL_SMALL_SPIC:
14009 case AARCH64_CMODEL_SMALL_PIC:
14010 if (!aarch64_symbol_binds_local_p (x))
14011 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14012 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14013 return SYMBOL_SMALL_ABSOLUTE;
14014
14015 case AARCH64_CMODEL_LARGE:
14016 /* This is alright even in PIC code as the constant
14017 pool reference is always PC relative and within
14018 the same translation unit. */
14019 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14020 return SYMBOL_SMALL_ABSOLUTE;
14021 else
14022 return SYMBOL_FORCE_TO_MEM;
14023
14024 default:
14025 gcc_unreachable ();
14026 }
14027 }
14028
14029 /* By default push everything into the constant pool. */
14030 return SYMBOL_FORCE_TO_MEM;
14031 }
14032
14033 bool
14034 aarch64_constant_address_p (rtx x)
14035 {
14036 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14037 }
14038
14039 bool
14040 aarch64_legitimate_pic_operand_p (rtx x)
14041 {
14042 if (GET_CODE (x) == SYMBOL_REF
14043 || (GET_CODE (x) == CONST
14044 && GET_CODE (XEXP (x, 0)) == PLUS
14045 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14046 return false;
14047
14048 return true;
14049 }
14050
14051 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14052 that should be rematerialized rather than spilled. */
14053
14054 static bool
14055 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14056 {
14057 /* Support CSE and rematerialization of common constants. */
14058 if (CONST_INT_P (x)
14059 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14060 || GET_CODE (x) == CONST_VECTOR)
14061 return true;
14062
14063 /* Do not allow vector struct mode constants for Advanced SIMD.
14064 We could support 0 and -1 easily, but they need support in
14065 aarch64-simd.md. */
14066 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14067 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14068 return false;
14069
14070 /* Only accept variable-length vector constants if they can be
14071 handled directly.
14072
14073 ??? It would be possible to handle rematerialization of other
14074 constants via secondary reloads. */
14075 if (vec_flags & VEC_ANY_SVE)
14076 return aarch64_simd_valid_immediate (x, NULL);
14077
14078 if (GET_CODE (x) == HIGH)
14079 x = XEXP (x, 0);
14080
14081 /* Accept polynomial constants that can be calculated by using the
14082 destination of a move as the sole temporary. Constants that
14083 require a second temporary cannot be rematerialized (they can't be
14084 forced to memory and also aren't legitimate constants). */
14085 poly_int64 offset;
14086 if (poly_int_rtx_p (x, &offset))
14087 return aarch64_offset_temporaries (false, offset) <= 1;
14088
14089 /* If an offset is being added to something else, we need to allow the
14090 base to be moved into the destination register, meaning that there
14091 are no free temporaries for the offset. */
14092 x = strip_offset (x, &offset);
14093 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14094 return false;
14095
14096 /* Do not allow const (plus (anchor_symbol, const_int)). */
14097 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14098 return false;
14099
14100 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14101 so spilling them is better than rematerialization. */
14102 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14103 return true;
14104
14105 /* Label references are always constant. */
14106 if (GET_CODE (x) == LABEL_REF)
14107 return true;
14108
14109 return false;
14110 }
14111
14112 rtx
14113 aarch64_load_tp (rtx target)
14114 {
14115 if (!target
14116 || GET_MODE (target) != Pmode
14117 || !register_operand (target, Pmode))
14118 target = gen_reg_rtx (Pmode);
14119
14120 /* Can return in any reg. */
14121 emit_insn (gen_aarch64_load_tp_hard (target));
14122 return target;
14123 }
14124
14125 /* On AAPCS systems, this is the "struct __va_list". */
14126 static GTY(()) tree va_list_type;
14127
14128 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14129 Return the type to use as __builtin_va_list.
14130
14131 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14132
14133 struct __va_list
14134 {
14135 void *__stack;
14136 void *__gr_top;
14137 void *__vr_top;
14138 int __gr_offs;
14139 int __vr_offs;
14140 }; */
14141
14142 static tree
14143 aarch64_build_builtin_va_list (void)
14144 {
14145 tree va_list_name;
14146 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14147
14148 /* Create the type. */
14149 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14150 /* Give it the required name. */
14151 va_list_name = build_decl (BUILTINS_LOCATION,
14152 TYPE_DECL,
14153 get_identifier ("__va_list"),
14154 va_list_type);
14155 DECL_ARTIFICIAL (va_list_name) = 1;
14156 TYPE_NAME (va_list_type) = va_list_name;
14157 TYPE_STUB_DECL (va_list_type) = va_list_name;
14158
14159 /* Create the fields. */
14160 f_stack = build_decl (BUILTINS_LOCATION,
14161 FIELD_DECL, get_identifier ("__stack"),
14162 ptr_type_node);
14163 f_grtop = build_decl (BUILTINS_LOCATION,
14164 FIELD_DECL, get_identifier ("__gr_top"),
14165 ptr_type_node);
14166 f_vrtop = build_decl (BUILTINS_LOCATION,
14167 FIELD_DECL, get_identifier ("__vr_top"),
14168 ptr_type_node);
14169 f_groff = build_decl (BUILTINS_LOCATION,
14170 FIELD_DECL, get_identifier ("__gr_offs"),
14171 integer_type_node);
14172 f_vroff = build_decl (BUILTINS_LOCATION,
14173 FIELD_DECL, get_identifier ("__vr_offs"),
14174 integer_type_node);
14175
14176 /* Tell tree-stdarg pass about our internal offset fields.
14177 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14178 purposes, to identify whether the code is updating the va_list internal
14179 offset fields in an irregular way. */
14180 va_list_gpr_counter_field = f_groff;
14181 va_list_fpr_counter_field = f_vroff;
14182
14183 DECL_ARTIFICIAL (f_stack) = 1;
14184 DECL_ARTIFICIAL (f_grtop) = 1;
14185 DECL_ARTIFICIAL (f_vrtop) = 1;
14186 DECL_ARTIFICIAL (f_groff) = 1;
14187 DECL_ARTIFICIAL (f_vroff) = 1;
14188
14189 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14190 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14191 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14192 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14193 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14194
14195 TYPE_FIELDS (va_list_type) = f_stack;
14196 DECL_CHAIN (f_stack) = f_grtop;
14197 DECL_CHAIN (f_grtop) = f_vrtop;
14198 DECL_CHAIN (f_vrtop) = f_groff;
14199 DECL_CHAIN (f_groff) = f_vroff;
14200
14201 /* Compute its layout. */
14202 layout_type (va_list_type);
14203
14204 return va_list_type;
14205 }
14206
14207 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14208 static void
14209 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14210 {
14211 const CUMULATIVE_ARGS *cum;
14212 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14213 tree stack, grtop, vrtop, groff, vroff;
14214 tree t;
14215 int gr_save_area_size = cfun->va_list_gpr_size;
14216 int vr_save_area_size = cfun->va_list_fpr_size;
14217 int vr_offset;
14218
14219 cum = &crtl->args.info;
14220 if (cfun->va_list_gpr_size)
14221 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14222 cfun->va_list_gpr_size);
14223 if (cfun->va_list_fpr_size)
14224 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14225 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14226
14227 if (!TARGET_FLOAT)
14228 {
14229 gcc_assert (cum->aapcs_nvrn == 0);
14230 vr_save_area_size = 0;
14231 }
14232
14233 f_stack = TYPE_FIELDS (va_list_type_node);
14234 f_grtop = DECL_CHAIN (f_stack);
14235 f_vrtop = DECL_CHAIN (f_grtop);
14236 f_groff = DECL_CHAIN (f_vrtop);
14237 f_vroff = DECL_CHAIN (f_groff);
14238
14239 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14240 NULL_TREE);
14241 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14242 NULL_TREE);
14243 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14244 NULL_TREE);
14245 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14246 NULL_TREE);
14247 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14248 NULL_TREE);
14249
14250 /* Emit code to initialize STACK, which points to the next varargs stack
14251 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14252 by named arguments. STACK is 8-byte aligned. */
14253 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14254 if (cum->aapcs_stack_size > 0)
14255 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14256 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14257 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14258
14259 /* Emit code to initialize GRTOP, the top of the GR save area.
14260 virtual_incoming_args_rtx should have been 16 byte aligned. */
14261 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14262 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14263 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14264
14265 /* Emit code to initialize VRTOP, the top of the VR save area.
14266 This address is gr_save_area_bytes below GRTOP, rounded
14267 down to the next 16-byte boundary. */
14268 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14269 vr_offset = ROUND_UP (gr_save_area_size,
14270 STACK_BOUNDARY / BITS_PER_UNIT);
14271
14272 if (vr_offset)
14273 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14274 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14275 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14276
14277 /* Emit code to initialize GROFF, the offset from GRTOP of the
14278 next GPR argument. */
14279 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14280 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14281 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14282
14283 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14284 of the next VR argument. */
14285 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14286 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14287 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14288 }
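
/* As an illustration (assuming the usual AAPCS64 values of eight X argument
   registers of 8 bytes and eight V argument registers of 16 bytes, and that
   the tree-stdarg pass has not limited the save areas): for a callee such as

     int f (int n, ...);

   N occupies X0, so the expansion above leaves __gr_offs == -56 (seven
   unconsumed X registers), __vr_offs == -128 (eight unconsumed V registers),
   __gr_top and __vr_top pointing just past the respective register save
   areas, and __stack pointing at the first stack-passed anonymous
   argument.  */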
14289
14290 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14291
14292 static tree
14293 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14294 gimple_seq *post_p ATTRIBUTE_UNUSED)
14295 {
14296 tree addr;
14297 bool indirect_p;
14298 bool is_ha; /* is HFA or HVA. */
14299 bool dw_align; /* double-word align. */
14300 machine_mode ag_mode = VOIDmode;
14301 int nregs;
14302 machine_mode mode;
14303
14304 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14305 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14306 HOST_WIDE_INT size, rsize, adjust, align;
14307 tree t, u, cond1, cond2;
14308
14309 indirect_p = pass_va_arg_by_reference (type);
14310 if (indirect_p)
14311 type = build_pointer_type (type);
14312
14313 mode = TYPE_MODE (type);
14314
14315 f_stack = TYPE_FIELDS (va_list_type_node);
14316 f_grtop = DECL_CHAIN (f_stack);
14317 f_vrtop = DECL_CHAIN (f_grtop);
14318 f_groff = DECL_CHAIN (f_vrtop);
14319 f_vroff = DECL_CHAIN (f_groff);
14320
14321 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14322 f_stack, NULL_TREE);
14323 size = int_size_in_bytes (type);
14324
14325 bool abi_break;
14326 align
14327 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14328
14329 dw_align = false;
14330 adjust = 0;
14331 if (aarch64_vfp_is_call_or_return_candidate (mode,
14332 type,
14333 &ag_mode,
14334 &nregs,
14335 &is_ha))
14336 {
14337 /* No frontends can create types with variable-sized modes, so we
14338 shouldn't be asked to pass or return them. */
14339 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14340
14341 /* TYPE passed in fp/simd registers. */
14342 if (!TARGET_FLOAT)
14343 aarch64_err_no_fpadvsimd (mode);
14344
14345 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14346 unshare_expr (valist), f_vrtop, NULL_TREE);
14347 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14348 unshare_expr (valist), f_vroff, NULL_TREE);
14349
14350 rsize = nregs * UNITS_PER_VREG;
14351
14352 if (is_ha)
14353 {
14354 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14355 adjust = UNITS_PER_VREG - ag_size;
14356 }
14357 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14358 && size < UNITS_PER_VREG)
14359 {
14360 adjust = UNITS_PER_VREG - size;
14361 }
14362 }
14363 else
14364 {
14365 /* TYPE passed in general registers. */
14366 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14367 unshare_expr (valist), f_grtop, NULL_TREE);
14368 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14369 unshare_expr (valist), f_groff, NULL_TREE);
14370 rsize = ROUND_UP (size, UNITS_PER_WORD);
14371 nregs = rsize / UNITS_PER_WORD;
14372
14373 if (align > 8)
14374 {
14375 if (abi_break && warn_psabi)
14376 inform (input_location, "parameter passing for argument of type "
14377 "%qT changed in GCC 9.1", type);
14378 dw_align = true;
14379 }
14380
14381 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14382 && size < UNITS_PER_WORD)
14383 {
14384 adjust = UNITS_PER_WORD - size;
14385 }
14386 }
14387
14388 /* Get a local temporary for the field value. */
14389 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14390
14391 /* Emit code to branch if off >= 0. */
14392 t = build2 (GE_EXPR, boolean_type_node, off,
14393 build_int_cst (TREE_TYPE (off), 0));
14394 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14395
14396 if (dw_align)
14397 {
14398 /* Emit: offs = (offs + 15) & -16. */
14399 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14400 build_int_cst (TREE_TYPE (off), 15));
14401 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14402 build_int_cst (TREE_TYPE (off), -16));
14403 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14404 }
14405 else
14406 roundup = NULL;
14407
14408 /* Update ap.__[g|v]r_offs */
14409 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14410 build_int_cst (TREE_TYPE (off), rsize));
14411 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14412
14413 /* String up. */
14414 if (roundup)
14415 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14416
14417 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14418 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14419 build_int_cst (TREE_TYPE (f_off), 0));
14420 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14421
14422 /* String up: make sure the assignment happens before the use. */
14423 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14424 COND_EXPR_ELSE (cond1) = t;
14425
14426 /* Prepare the trees handling the argument that is passed on the stack;
14427 the top-level node will be stored in ON_STACK. */
14428 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14429 if (align > 8)
14430 {
14431 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14432 t = fold_build_pointer_plus_hwi (arg, 15);
14433 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14434 build_int_cst (TREE_TYPE (t), -16));
14435 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14436 }
14437 else
14438 roundup = NULL;
14439 /* Advance ap.__stack */
14440 t = fold_build_pointer_plus_hwi (arg, size + 7);
14441 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14442 build_int_cst (TREE_TYPE (t), -8));
14443 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14444 /* String up roundup and advance. */
14445 if (roundup)
14446 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14447 /* String up with arg */
14448 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14449 /* Big-endianness related address adjustment. */
14450 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14451 && size < UNITS_PER_WORD)
14452 {
14453 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14454 size_int (UNITS_PER_WORD - size));
14455 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14456 }
14457
14458 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14459 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14460
14461 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14462 t = off;
14463 if (adjust)
14464 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14465 build_int_cst (TREE_TYPE (off), adjust));
14466
14467 t = fold_convert (sizetype, t);
14468 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14469
14470 if (is_ha)
14471 {
14472 /* type ha; // treat as "struct {ftype field[n];}"
14473 ... [computing offs]
14474 for (i = 0; i < nregs; ++i, offs += 16)
14475 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14476 return ha; */
14477 int i;
14478 tree tmp_ha, field_t, field_ptr_t;
14479
14480 /* Declare a local variable. */
14481 tmp_ha = create_tmp_var_raw (type, "ha");
14482 gimple_add_tmp_var (tmp_ha);
14483
14484 /* Establish the base type. */
14485 switch (ag_mode)
14486 {
14487 case E_SFmode:
14488 field_t = float_type_node;
14489 field_ptr_t = float_ptr_type_node;
14490 break;
14491 case E_DFmode:
14492 field_t = double_type_node;
14493 field_ptr_t = double_ptr_type_node;
14494 break;
14495 case E_TFmode:
14496 field_t = long_double_type_node;
14497 field_ptr_t = long_double_ptr_type_node;
14498 break;
14499 case E_HFmode:
14500 field_t = aarch64_fp16_type_node;
14501 field_ptr_t = aarch64_fp16_ptr_type_node;
14502 break;
14503 case E_V2SImode:
14504 case E_V4SImode:
14505 {
14506 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14507 field_t = build_vector_type_for_mode (innertype, ag_mode);
14508 field_ptr_t = build_pointer_type (field_t);
14509 }
14510 break;
14511 default:
14512 gcc_assert (0);
14513 }
14514
14515 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14516 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14517 addr = t;
14518 t = fold_convert (field_ptr_t, addr);
14519 t = build2 (MODIFY_EXPR, field_t,
14520 build1 (INDIRECT_REF, field_t, tmp_ha),
14521 build1 (INDIRECT_REF, field_t, t));
14522
14523 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14524 for (i = 1; i < nregs; ++i)
14525 {
14526 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14527 u = fold_convert (field_ptr_t, addr);
14528 u = build2 (MODIFY_EXPR, field_t,
14529 build2 (MEM_REF, field_t, tmp_ha,
14530 build_int_cst (field_ptr_t,
14531 (i *
14532 int_size_in_bytes (field_t)))),
14533 build1 (INDIRECT_REF, field_t, u));
14534 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14535 }
14536
14537 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14538 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14539 }
14540
14541 COND_EXPR_ELSE (cond2) = t;
14542 addr = fold_convert (build_pointer_type (type), cond1);
14543 addr = build_va_arg_indirect_ref (addr);
14544
14545 if (indirect_p)
14546 addr = build_va_arg_indirect_ref (addr);
14547
14548 return addr;
14549 }
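
/* A worked example of the stack-path arithmetic above, assuming a
   hypothetical 12-byte argument type with 16-byte alignment:

     arg        = ap.__stack;              // say 0x...08
     arg        = (arg + 15) & -16;        // dw_align round-up -> 0x...10
     ap.__stack = (arg + 12 + 7) & -8;     // advance __stack   -> 0x...20

   and the argument itself is then read from ARG (plus the big-endian
   padding adjustment, where applicable).  */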
14550
14551 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14552
14553 static void
14554 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14555 const function_arg_info &arg,
14556 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14557 {
14558 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14559 CUMULATIVE_ARGS local_cum;
14560 int gr_saved = cfun->va_list_gpr_size;
14561 int vr_saved = cfun->va_list_fpr_size;
14562
14563 /* The caller has advanced CUM up to, but not beyond, the last named
14564 argument. Advance a local copy of CUM past the last "real" named
14565 argument, to find out how many registers are left over. */
14566 local_cum = *cum;
14567 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14568
14569 /* Find out how many registers we need to save.
14570 Honor tree-stdarg analysis results. */
14571 if (cfun->va_list_gpr_size)
14572 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14573 cfun->va_list_gpr_size / UNITS_PER_WORD);
14574 if (cfun->va_list_fpr_size)
14575 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14576 cfun->va_list_fpr_size / UNITS_PER_VREG);
14577
14578 if (!TARGET_FLOAT)
14579 {
14580 gcc_assert (local_cum.aapcs_nvrn == 0);
14581 vr_saved = 0;
14582 }
14583
14584 if (!no_rtl)
14585 {
14586 if (gr_saved > 0)
14587 {
14588 rtx ptr, mem;
14589
14590 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14591 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14592 - gr_saved * UNITS_PER_WORD);
14593 mem = gen_frame_mem (BLKmode, ptr);
14594 set_mem_alias_set (mem, get_varargs_alias_set ());
14595
14596 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14597 mem, gr_saved);
14598 }
14599 if (vr_saved > 0)
14600 {
14601 /* We can't use move_block_from_reg, because it will use
14602 the wrong mode, storing D regs only. */
14603 machine_mode mode = TImode;
14604 int off, i, vr_start;
14605
14606 /* Set OFF to the offset from virtual_incoming_args_rtx of
14607 the first vector register. The VR save area lies below
14608 the GR one, and is aligned to 16 bytes. */
14609 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14610 STACK_BOUNDARY / BITS_PER_UNIT);
14611 off -= vr_saved * UNITS_PER_VREG;
14612
14613 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14614 for (i = 0; i < vr_saved; ++i)
14615 {
14616 rtx ptr, mem;
14617
14618 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14619 mem = gen_frame_mem (mode, ptr);
14620 set_mem_alias_set (mem, get_varargs_alias_set ());
14621 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14622 off += UNITS_PER_VREG;
14623 }
14624 }
14625 }
14626
14627 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14628 any complication of having crtl->args.pretend_args_size changed. */
14629 cfun->machine->frame.saved_varargs_size
14630 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14631 STACK_BOUNDARY / BITS_PER_UNIT)
14632 + vr_saved * UNITS_PER_VREG);
14633 }
14634
14635 static void
14636 aarch64_conditional_register_usage (void)
14637 {
14638 int i;
14639 if (!TARGET_FLOAT)
14640 {
14641 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14642 {
14643 fixed_regs[i] = 1;
14644 call_used_regs[i] = 1;
14645 }
14646 }
14647 if (!TARGET_SVE)
14648 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14649 {
14650 fixed_regs[i] = 1;
14651 call_used_regs[i] = 1;
14652 }
14653
14654 /* When tracking speculation, we need a couple of call-clobbered registers
14655 to track the speculation state. It would be nice to just use
14656 IP0 and IP1, but currently there are numerous places that just
14657 assume these registers are free for other uses (eg pointer
14658 authentication). */
14659 if (aarch64_track_speculation)
14660 {
14661 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14662 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14663 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14664 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14665 }
14666 }
14667
14668 /* Walk down the type tree of TYPE counting consecutive base elements.
14669 If *MODEP is VOIDmode, then set it to the first valid floating point
14670 type. If a non-floating point type is found, or if a floating point
14671 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14672 otherwise return the count in the sub-tree. */
14673 static int
14674 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14675 {
14676 machine_mode mode;
14677 HOST_WIDE_INT size;
14678
14679 switch (TREE_CODE (type))
14680 {
14681 case REAL_TYPE:
14682 mode = TYPE_MODE (type);
14683 if (mode != DFmode && mode != SFmode
14684 && mode != TFmode && mode != HFmode)
14685 return -1;
14686
14687 if (*modep == VOIDmode)
14688 *modep = mode;
14689
14690 if (*modep == mode)
14691 return 1;
14692
14693 break;
14694
14695 case COMPLEX_TYPE:
14696 mode = TYPE_MODE (TREE_TYPE (type));
14697 if (mode != DFmode && mode != SFmode
14698 && mode != TFmode && mode != HFmode)
14699 return -1;
14700
14701 if (*modep == VOIDmode)
14702 *modep = mode;
14703
14704 if (*modep == mode)
14705 return 2;
14706
14707 break;
14708
14709 case VECTOR_TYPE:
14710 /* Use V2SImode and V4SImode as representatives of all 64-bit
14711 and 128-bit vector types. */
14712 size = int_size_in_bytes (type);
14713 switch (size)
14714 {
14715 case 8:
14716 mode = V2SImode;
14717 break;
14718 case 16:
14719 mode = V4SImode;
14720 break;
14721 default:
14722 return -1;
14723 }
14724
14725 if (*modep == VOIDmode)
14726 *modep = mode;
14727
14728 /* Vector modes are considered to be opaque: two vectors are
14729 equivalent for the purposes of being homogeneous aggregates
14730 if they are the same size. */
14731 if (*modep == mode)
14732 return 1;
14733
14734 break;
14735
14736 case ARRAY_TYPE:
14737 {
14738 int count;
14739 tree index = TYPE_DOMAIN (type);
14740
14741 /* Can't handle incomplete types or sizes that are not
14742 fixed. */
14743 if (!COMPLETE_TYPE_P (type)
14744 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14745 return -1;
14746
14747 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14748 if (count == -1
14749 || !index
14750 || !TYPE_MAX_VALUE (index)
14751 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14752 || !TYPE_MIN_VALUE (index)
14753 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14754 || count < 0)
14755 return -1;
14756
14757 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14758 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14759
14760 /* There must be no padding. */
14761 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14762 count * GET_MODE_BITSIZE (*modep)))
14763 return -1;
14764
14765 return count;
14766 }
14767
14768 case RECORD_TYPE:
14769 {
14770 int count = 0;
14771 int sub_count;
14772 tree field;
14773
14774 /* Can't handle incomplete types or sizes that are not
14775 fixed. */
14776 if (!COMPLETE_TYPE_P (type)
14777 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14778 return -1;
14779
14780 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14781 {
14782 if (TREE_CODE (field) != FIELD_DECL)
14783 continue;
14784
14785 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14786 if (sub_count < 0)
14787 return -1;
14788 count += sub_count;
14789 }
14790
14791 /* There must be no padding. */
14792 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14793 count * GET_MODE_BITSIZE (*modep)))
14794 return -1;
14795
14796 return count;
14797 }
14798
14799 case UNION_TYPE:
14800 case QUAL_UNION_TYPE:
14801 {
14802 /* These aren't very interesting except in a degenerate case. */
14803 int count = 0;
14804 int sub_count;
14805 tree field;
14806
14807 /* Can't handle incomplete types or sizes that are not
14808 fixed. */
14809 if (!COMPLETE_TYPE_P (type)
14810 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14811 return -1;
14812
14813 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14814 {
14815 if (TREE_CODE (field) != FIELD_DECL)
14816 continue;
14817
14818 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14819 if (sub_count < 0)
14820 return -1;
14821 count = count > sub_count ? count : sub_count;
14822 }
14823
14824 /* There must be no padding. */
14825 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14826 count * GET_MODE_BITSIZE (*modep)))
14827 return -1;
14828
14829 return count;
14830 }
14831
14832 default:
14833 break;
14834 }
14835
14836 return -1;
14837 }
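
/* For example, given

     struct s { float x, y, z; };

   the function above returns 3 with *MODEP set to SFmode, whereas

     struct t { float f; double d; };

   returns -1 because the element modes differ.  */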
14838
14839 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14840 type as described in AAPCS64 \S 4.1.2.
14841
14842 See the comment above aarch64_composite_type_p for the notes on MODE. */
14843
14844 static bool
14845 aarch64_short_vector_p (const_tree type,
14846 machine_mode mode)
14847 {
14848 poly_int64 size = -1;
14849
14850 if (type && TREE_CODE (type) == VECTOR_TYPE)
14851 size = int_size_in_bytes (type);
14852 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14853 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14854 size = GET_MODE_SIZE (mode);
14855
14856 return known_eq (size, 8) || known_eq (size, 16);
14857 }
14858
14859 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14860 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14861 array types. The C99 floating-point complex types are also considered
14862 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14863 types, which are GCC extensions and out of the scope of AAPCS64, are
14864 treated as composite types here as well.
14865
14866 Note that MODE itself is not sufficient in determining whether a type
14867 is such a composite type or not. This is because
14868 stor-layout.c:compute_record_mode may have already changed the MODE
14869 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14870 structure with only one field may have its MODE set to the mode of the
14871 field. Also an integer mode whose size matches the size of the
14872 RECORD_TYPE type may be used to substitute the original mode
14873 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14874 solely relied on. */
14875
14876 static bool
14877 aarch64_composite_type_p (const_tree type,
14878 machine_mode mode)
14879 {
14880 if (aarch64_short_vector_p (type, mode))
14881 return false;
14882
14883 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14884 return true;
14885
14886 if (mode == BLKmode
14887 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14888 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14889 return true;
14890
14891 return false;
14892 }
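
/* For example, structures, unions, arrays and _Complex double are all
   composite types here, whereas a lone double or a 128-bit short vector
   such as int32x4_t is not.  */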
14893
14894 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14895 shall be passed or returned in simd/fp register(s) (providing these
14896 parameter passing registers are available).
14897
14898 Upon successful return, *COUNT returns the number of needed registers,
14899 *BASE_MODE returns the mode of the individual register and, when IS_HA
14900 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14901 floating-point aggregate or a homogeneous short-vector aggregate. */
14902
14903 static bool
14904 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14905 const_tree type,
14906 machine_mode *base_mode,
14907 int *count,
14908 bool *is_ha)
14909 {
14910 machine_mode new_mode = VOIDmode;
14911 bool composite_p = aarch64_composite_type_p (type, mode);
14912
14913 if (is_ha != NULL) *is_ha = false;
14914
14915 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14916 || aarch64_short_vector_p (type, mode))
14917 {
14918 *count = 1;
14919 new_mode = mode;
14920 }
14921 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14922 {
14923 if (is_ha != NULL) *is_ha = true;
14924 *count = 2;
14925 new_mode = GET_MODE_INNER (mode);
14926 }
14927 else if (type && composite_p)
14928 {
14929 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14930
14931 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14932 {
14933 if (is_ha != NULL) *is_ha = true;
14934 *count = ag_count;
14935 }
14936 else
14937 return false;
14938 }
14939 else
14940 return false;
14941
14942 *base_mode = new_mode;
14943 return true;
14944 }
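
/* Some illustrative classifications (with HA_MAX_NUM_FLDS being 4 here):

     double                  -> true,  *count = 1, *base_mode = DFmode
     _Complex double         -> true,  *count = 2, *base_mode = DFmode, HA
     struct { float f[3]; }  -> true,  *count = 3, *base_mode = SFmode, HA
     struct { float f[5]; }  -> false (more than HA_MAX_NUM_FLDS members)

   Callers then decide whether enough V registers remain to honour the
   classification.  */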
14945
14946 /* Implement TARGET_STRUCT_VALUE_RTX. */
14947
14948 static rtx
14949 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14950 int incoming ATTRIBUTE_UNUSED)
14951 {
14952 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14953 }
14954
14955 /* Implements target hook vector_mode_supported_p. */
14956 static bool
14957 aarch64_vector_mode_supported_p (machine_mode mode)
14958 {
14959 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14960 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14961 }
14962
14963 /* Return the full-width SVE vector mode for element mode MODE, if one
14964 exists. */
14965 opt_machine_mode
14966 aarch64_full_sve_mode (scalar_mode mode)
14967 {
14968 switch (mode)
14969 {
14970 case E_DFmode:
14971 return VNx2DFmode;
14972 case E_SFmode:
14973 return VNx4SFmode;
14974 case E_HFmode:
14975 return VNx8HFmode;
14976 case E_DImode:
14977 return VNx2DImode;
14978 case E_SImode:
14979 return VNx4SImode;
14980 case E_HImode:
14981 return VNx8HImode;
14982 case E_QImode:
14983 return VNx16QImode;
14984 default:
14985 return opt_machine_mode ();
14986 }
14987 }
14988
14989 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14990 if it exists. */
14991 opt_machine_mode
14992 aarch64_vq_mode (scalar_mode mode)
14993 {
14994 switch (mode)
14995 {
14996 case E_DFmode:
14997 return V2DFmode;
14998 case E_SFmode:
14999 return V4SFmode;
15000 case E_HFmode:
15001 return V8HFmode;
15002 case E_SImode:
15003 return V4SImode;
15004 case E_HImode:
15005 return V8HImode;
15006 case E_QImode:
15007 return V16QImode;
15008 case E_DImode:
15009 return V2DImode;
15010 default:
15011 return opt_machine_mode ();
15012 }
15013 }
15014
15015 /* Return the appropriate SIMD container mode
15016 for MODE within a vector of WIDTH bits. */
15017 static machine_mode
15018 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15019 {
15020 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15021 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15022
15023 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15024 if (TARGET_SIMD)
15025 {
15026 if (known_eq (width, 128))
15027 return aarch64_vq_mode (mode).else_mode (word_mode);
15028 else
15029 switch (mode)
15030 {
15031 case E_SFmode:
15032 return V2SFmode;
15033 case E_HFmode:
15034 return V4HFmode;
15035 case E_SImode:
15036 return V2SImode;
15037 case E_HImode:
15038 return V4HImode;
15039 case E_QImode:
15040 return V8QImode;
15041 default:
15042 break;
15043 }
15044 }
15045 return word_mode;
15046 }
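
/* For example, with TARGET_SIMD enabled:

     aarch64_simd_container_mode (SFmode, 128) -> V4SFmode
     aarch64_simd_container_mode (SFmode, 64)  -> V2SFmode
     aarch64_simd_container_mode (DFmode, 64)  -> DImode (word_mode; there is
                                                  no 64-bit container for DF)

   and with TARGET_SVE, a width of BITS_PER_SVE_VECTOR maps SFmode to
   VNx4SFmode instead.  */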
15047
15048 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15049 static machine_mode
15050 aarch64_preferred_simd_mode (scalar_mode mode)
15051 {
15052 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15053 return aarch64_simd_container_mode (mode, bits);
15054 }
15055
15056 /* Return a list of possible vector sizes for the vectorizer
15057 to iterate over. */
15058 static void
15059 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15060 {
15061 if (TARGET_SVE)
15062 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15063 sizes->safe_push (16);
15064 sizes->safe_push (8);
15065 }
15066
15067 /* Implement TARGET_MANGLE_TYPE. */
15068
15069 static const char *
15070 aarch64_mangle_type (const_tree type)
15071 {
15072 /* The AArch64 ABI documents say that "__va_list" has to be
15073 mangled as if it is in the "std" namespace. */
15074 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15075 return "St9__va_list";
15076
15077 /* Half-precision float. */
15078 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15079 return "Dh";
15080
15081 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15082 builtin types. */
15083 if (TYPE_NAME (type) != NULL)
15084 return aarch64_mangle_builtin_type (type);
15085
15086 /* Use the default mangling. */
15087 return NULL;
15088 }
15089
15090 /* Find the first rtx_insn before insn that will generate an assembly
15091 instruction. */
15092
15093 static rtx_insn *
15094 aarch64_prev_real_insn (rtx_insn *insn)
15095 {
15096 if (!insn)
15097 return NULL;
15098
15099 do
15100 {
15101 insn = prev_real_insn (insn);
15102 }
15103 while (insn && recog_memoized (insn) < 0);
15104
15105 return insn;
15106 }
15107
15108 static bool
15109 is_madd_op (enum attr_type t1)
15110 {
15111 unsigned int i;
15112 /* A number of these may be AArch32 only. */
15113 enum attr_type mlatypes[] = {
15114 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15115 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15116 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15117 };
15118
15119 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15120 {
15121 if (t1 == mlatypes[i])
15122 return true;
15123 }
15124
15125 return false;
15126 }
15127
15128 /* Check if there is a register dependency between a load and the insn
15129 for which we hold recog_data. */
15130
15131 static bool
15132 dep_between_memop_and_curr (rtx memop)
15133 {
15134 rtx load_reg;
15135 int opno;
15136
15137 gcc_assert (GET_CODE (memop) == SET);
15138
15139 if (!REG_P (SET_DEST (memop)))
15140 return false;
15141
15142 load_reg = SET_DEST (memop);
15143 for (opno = 1; opno < recog_data.n_operands; opno++)
15144 {
15145 rtx operand = recog_data.operand[opno];
15146 if (REG_P (operand)
15147 && reg_overlap_mentioned_p (load_reg, operand))
15148 return true;
15149
15150 }
15151 return false;
15152 }
15153
15154
15155 /* When working around the Cortex-A53 erratum 835769,
15156 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15157 instruction and has a preceding memory instruction such that a NOP
15158 should be inserted between them. */
15159
15160 bool
15161 aarch64_madd_needs_nop (rtx_insn* insn)
15162 {
15163 enum attr_type attr_type;
15164 rtx_insn *prev;
15165 rtx body;
15166
15167 if (!TARGET_FIX_ERR_A53_835769)
15168 return false;
15169
15170 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15171 return false;
15172
15173 attr_type = get_attr_type (insn);
15174 if (!is_madd_op (attr_type))
15175 return false;
15176
15177 prev = aarch64_prev_real_insn (insn);
15178 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15179 Restore recog state to INSN to avoid state corruption. */
15180 extract_constrain_insn_cached (insn);
15181
15182 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15183 return false;
15184
15185 body = single_set (prev);
15186
15187 /* If the previous insn is a memory op and there is no dependency between
15188 it and the DImode madd, emit a NOP between them. If body is NULL then we
15189 have a complex memory operation, probably a load/store pair.
15190 Be conservative for now and emit a NOP. */
15191 if (GET_MODE (recog_data.operand[0]) == DImode
15192 && (!body || !dep_between_memop_and_curr (body)))
15193 return true;
15194
15195 return false;
15196
15197 }
15198
15199
15200 /* Implement FINAL_PRESCAN_INSN. */
15201
15202 void
15203 aarch64_final_prescan_insn (rtx_insn *insn)
15204 {
15205 if (aarch64_madd_needs_nop (insn))
15206 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15207 }
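
/* For illustration, with -mfix-cortex-a53-835769 the prescan hook turns a
   sequence such as

     ldr   x1, [x2]
     madd  x0, x3, x4, x0

   into

     ldr   x1, [x2]
     nop                              // between mem op and mult-accumulate
     madd  x0, x3, x4, x0

   whenever the 64-bit multiply-accumulate does not depend on the value
   loaded by the preceding memory instruction (or when that instruction is
   too complex to analyse, such as a load/store pair).  */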
15208
15209
15210 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15211 instruction. */
15212
15213 bool
15214 aarch64_sve_index_immediate_p (rtx base_or_step)
15215 {
15216 return (CONST_INT_P (base_or_step)
15217 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15218 }
15219
15220 /* Return true if X is a valid immediate for the SVE ADD and SUB
15221 instructions. Negate X first if NEGATE_P is true. */
15222
15223 bool
15224 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15225 {
15226 rtx elt;
15227
15228 if (!const_vec_duplicate_p (x, &elt)
15229 || !CONST_INT_P (elt))
15230 return false;
15231
15232 HOST_WIDE_INT val = INTVAL (elt);
15233 if (negate_p)
15234 val = -val;
15235 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15236
15237 if (val & 0xff)
15238 return IN_RANGE (val, 0, 0xff);
15239 return IN_RANGE (val, 0, 0xff00);
15240 }
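
/* For example, replicating 0x2f is valid (an unshifted 8-bit immediate),
   and so is 0x3f00 (an 8-bit immediate shifted left by 8), but 0x101 is
   not, because its low byte is nonzero and the value exceeds 0xff.  */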
15241
15242 /* Return true if X is a valid immediate operand for an SVE logical
15243 instruction such as AND. */
15244
15245 bool
15246 aarch64_sve_bitmask_immediate_p (rtx x)
15247 {
15248 rtx elt;
15249
15250 return (const_vec_duplicate_p (x, &elt)
15251 && CONST_INT_P (elt)
15252 && aarch64_bitmask_imm (INTVAL (elt),
15253 GET_MODE_INNER (GET_MODE (x))));
15254 }
15255
15256 /* Return true if X is a valid immediate for the SVE DUP and CPY
15257 instructions. */
15258
15259 bool
15260 aarch64_sve_dup_immediate_p (rtx x)
15261 {
15262 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15263 if (!CONST_INT_P (x))
15264 return false;
15265
15266 HOST_WIDE_INT val = INTVAL (x);
15267 if (val & 0xff)
15268 return IN_RANGE (val, -0x80, 0x7f);
15269 return IN_RANGE (val, -0x8000, 0x7f00);
15270 }
15271
15272 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15273 SIGNED_P says whether the operand is signed rather than unsigned. */
15274
15275 bool
15276 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15277 {
15278 rtx elt;
15279
15280 return (const_vec_duplicate_p (x, &elt)
15281 && CONST_INT_P (elt)
15282 && (signed_p
15283 ? IN_RANGE (INTVAL (elt), -16, 15)
15284 : IN_RANGE (INTVAL (elt), 0, 127)));
15285 }
15286
15287 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15288 instruction. Negate X first if NEGATE_P is true. */
15289
15290 bool
15291 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15292 {
15293 rtx elt;
15294 REAL_VALUE_TYPE r;
15295
15296 if (!const_vec_duplicate_p (x, &elt)
15297 || GET_CODE (elt) != CONST_DOUBLE)
15298 return false;
15299
15300 r = *CONST_DOUBLE_REAL_VALUE (elt);
15301
15302 if (negate_p)
15303 r = real_value_negate (&r);
15304
15305 if (real_equal (&r, &dconst1))
15306 return true;
15307 if (real_equal (&r, &dconsthalf))
15308 return true;
15309 return false;
15310 }
15311
15312 /* Return true if X is a valid immediate operand for an SVE FMUL
15313 instruction. */
15314
15315 bool
15316 aarch64_sve_float_mul_immediate_p (rtx x)
15317 {
15318 rtx elt;
15319
15320 return (const_vec_duplicate_p (x, &elt)
15321 && GET_CODE (elt) == CONST_DOUBLE
15322 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15323 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15324 }
15325
15326 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15327 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15328 is nonnull, use it to describe valid immediates. */
15329 static bool
15330 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15331 simd_immediate_info *info,
15332 enum simd_immediate_check which,
15333 simd_immediate_info::insn_type insn)
15334 {
15335 /* Try a 4-byte immediate with LSL. */
15336 for (unsigned int shift = 0; shift < 32; shift += 8)
15337 if ((val32 & (0xff << shift)) == val32)
15338 {
15339 if (info)
15340 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15341 simd_immediate_info::LSL, shift);
15342 return true;
15343 }
15344
15345 /* Try a 2-byte immediate with LSL. */
15346 unsigned int imm16 = val32 & 0xffff;
15347 if (imm16 == (val32 >> 16))
15348 for (unsigned int shift = 0; shift < 16; shift += 8)
15349 if ((imm16 & (0xff << shift)) == imm16)
15350 {
15351 if (info)
15352 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15353 simd_immediate_info::LSL, shift);
15354 return true;
15355 }
15356
15357 /* Try a 4-byte immediate with MSL, except for cases that MVN
15358 can handle. */
15359 if (which == AARCH64_CHECK_MOV)
15360 for (unsigned int shift = 8; shift < 24; shift += 8)
15361 {
15362 unsigned int low = (1 << shift) - 1;
15363 if (((val32 & (0xff << shift)) | low) == val32)
15364 {
15365 if (info)
15366 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15367 simd_immediate_info::MSL, shift);
15368 return true;
15369 }
15370 }
15371
15372 return false;
15373 }
15374
15375 /* Return true if replicating VAL64 is a valid immediate for the
15376 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15377 use it to describe valid immediates. */
15378 static bool
15379 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15380 simd_immediate_info *info,
15381 enum simd_immediate_check which)
15382 {
15383 unsigned int val32 = val64 & 0xffffffff;
15384 unsigned int val16 = val64 & 0xffff;
15385 unsigned int val8 = val64 & 0xff;
15386
15387 if (val32 == (val64 >> 32))
15388 {
15389 if ((which & AARCH64_CHECK_ORR) != 0
15390 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15391 simd_immediate_info::MOV))
15392 return true;
15393
15394 if ((which & AARCH64_CHECK_BIC) != 0
15395 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15396 simd_immediate_info::MVN))
15397 return true;
15398
15399 /* Try using a replicated byte. */
15400 if (which == AARCH64_CHECK_MOV
15401 && val16 == (val32 >> 16)
15402 && val8 == (val16 >> 8))
15403 {
15404 if (info)
15405 *info = simd_immediate_info (QImode, val8);
15406 return true;
15407 }
15408 }
15409
15410 /* Try using a bit-to-bytemask. */
15411 if (which == AARCH64_CHECK_MOV)
15412 {
15413 unsigned int i;
15414 for (i = 0; i < 64; i += 8)
15415 {
15416 unsigned char byte = (val64 >> i) & 0xff;
15417 if (byte != 0 && byte != 0xff)
15418 break;
15419 }
15420 if (i == 64)
15421 {
15422 if (info)
15423 *info = simd_immediate_info (DImode, val64);
15424 return true;
15425 }
15426 }
15427 return false;
15428 }
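
/* For example, replicating 0x0000ff00 matches the shifted form
   MOVI Vd.4S, #0xff, LSL #8; replicating 0x0012ffff matches the
   shifted-ones form MOVI Vd.4S, #0x12, MSL #16; and the 64-bit value
   0xff00ffffff0000ff is representable as a bit-to-bytemask because every
   one of its bytes is either 0x00 or 0xff.  */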
15429
15430 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15431 instruction. If INFO is nonnull, use it to describe valid immediates. */
15432
15433 static bool
15434 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15435 simd_immediate_info *info)
15436 {
15437 scalar_int_mode mode = DImode;
15438 unsigned int val32 = val64 & 0xffffffff;
15439 if (val32 == (val64 >> 32))
15440 {
15441 mode = SImode;
15442 unsigned int val16 = val32 & 0xffff;
15443 if (val16 == (val32 >> 16))
15444 {
15445 mode = HImode;
15446 unsigned int val8 = val16 & 0xff;
15447 if (val8 == (val16 >> 8))
15448 mode = QImode;
15449 }
15450 }
15451 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15452 if (IN_RANGE (val, -0x80, 0x7f))
15453 {
15454 /* DUP with no shift. */
15455 if (info)
15456 *info = simd_immediate_info (mode, val);
15457 return true;
15458 }
15459 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15460 {
15461 /* DUP with LSL #8. */
15462 if (info)
15463 *info = simd_immediate_info (mode, val);
15464 return true;
15465 }
15466 if (aarch64_bitmask_imm (val64, mode))
15467 {
15468 /* DUPM. */
15469 if (info)
15470 *info = simd_immediate_info (mode, val);
15471 return true;
15472 }
15473 return false;
15474 }
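
/* For example, replicating 0x29 in every byte is a DUP .B immediate,
   replicating 0xff00 in every halfword is a DUP .H immediate with LSL #8
   (the value narrows to -256 in HImode), and replicating the halfword
   0x00ff is handled as a DUPM bitmask immediate.  */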
15475
15476 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15477 it to describe valid immediates. */
15478
15479 static bool
15480 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15481 {
15482 if (x == CONST0_RTX (GET_MODE (x)))
15483 {
15484 if (info)
15485 *info = simd_immediate_info (DImode, 0);
15486 return true;
15487 }
15488
15489 /* Analyze the value as a VNx16BImode. This should be relatively
15490 efficient, since rtx_vector_builder has enough built-in capacity
15491 to store all VLA predicate constants without needing the heap. */
15492 rtx_vector_builder builder;
15493 if (!aarch64_get_sve_pred_bits (builder, x))
15494 return false;
15495
15496 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15497 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15498 {
15499 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15500 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15501 if (pattern != AARCH64_NUM_SVPATTERNS)
15502 {
15503 if (info)
15504 {
15505 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15506 *info = simd_immediate_info (int_mode, pattern);
15507 }
15508 return true;
15509 }
15510 }
15511 return false;
15512 }
15513
15514 /* Return true if OP is a valid SIMD immediate for the operation
15515 described by WHICH. If INFO is nonnull, use it to describe valid
15516 immediates. */
15517 bool
15518 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15519 enum simd_immediate_check which)
15520 {
15521 machine_mode mode = GET_MODE (op);
15522 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15523 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15524 return false;
15525
15526 if (vec_flags & VEC_SVE_PRED)
15527 return aarch64_sve_pred_valid_immediate (op, info);
15528
15529 scalar_mode elt_mode = GET_MODE_INNER (mode);
15530 rtx base, step;
15531 unsigned int n_elts;
15532 if (GET_CODE (op) == CONST_VECTOR
15533 && CONST_VECTOR_DUPLICATE_P (op))
15534 n_elts = CONST_VECTOR_NPATTERNS (op);
15535 else if ((vec_flags & VEC_SVE_DATA)
15536 && const_vec_series_p (op, &base, &step))
15537 {
15538 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15539 if (!aarch64_sve_index_immediate_p (base)
15540 || !aarch64_sve_index_immediate_p (step))
15541 return false;
15542
15543 if (info)
15544 *info = simd_immediate_info (elt_mode, base, step);
15545 return true;
15546 }
15547 else if (GET_CODE (op) == CONST_VECTOR
15548 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15549 /* N_ELTS set above. */;
15550 else
15551 return false;
15552
15553 scalar_float_mode elt_float_mode;
15554 if (n_elts == 1
15555 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15556 {
15557 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15558 if (aarch64_float_const_zero_rtx_p (elt)
15559 || aarch64_float_const_representable_p (elt))
15560 {
15561 if (info)
15562 *info = simd_immediate_info (elt_float_mode, elt);
15563 return true;
15564 }
15565 }
15566
15567 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15568 if (elt_size > 8)
15569 return false;
15570
15571 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15572
15573 /* Expand the vector constant out into a byte vector, with the least
15574 significant byte of the register first. */
15575 auto_vec<unsigned char, 16> bytes;
15576 bytes.reserve (n_elts * elt_size);
15577 for (unsigned int i = 0; i < n_elts; i++)
15578 {
15579 /* The vector is provided in gcc endian-neutral fashion.
15580 For aarch64_be Advanced SIMD, it must be laid out in the vector
15581 register in reverse order. */
15582 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15583 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15584
15585 if (elt_mode != elt_int_mode)
15586 elt = gen_lowpart (elt_int_mode, elt);
15587
15588 if (!CONST_INT_P (elt))
15589 return false;
15590
15591 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15592 for (unsigned int byte = 0; byte < elt_size; byte++)
15593 {
15594 bytes.quick_push (elt_val & 0xff);
15595 elt_val >>= BITS_PER_UNIT;
15596 }
15597 }
15598
15599 /* The immediate must repeat every eight bytes. */
15600 unsigned int nbytes = bytes.length ();
15601 for (unsigned i = 8; i < nbytes; ++i)
15602 if (bytes[i] != bytes[i - 8])
15603 return false;
15604
15605 /* Get the repeating 8-byte value as an integer. No endian correction
15606 is needed here because bytes is already in lsb-first order. */
15607 unsigned HOST_WIDE_INT val64 = 0;
15608 for (unsigned int i = 0; i < 8; i++)
15609 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15610 << (i * BITS_PER_UNIT));
15611
15612 if (vec_flags & VEC_SVE_DATA)
15613 return aarch64_sve_valid_immediate (val64, info);
15614 else
15615 return aarch64_advsimd_valid_immediate (val64, info, which);
15616 }
15617
15618 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15619 has a step in the range of INDEX. Return the index expression if so,
15620 otherwise return null. */
15621 rtx
15622 aarch64_check_zero_based_sve_index_immediate (rtx x)
15623 {
15624 rtx base, step;
15625 if (const_vec_series_p (x, &base, &step)
15626 && base == const0_rtx
15627 && aarch64_sve_index_immediate_p (step))
15628 return step;
15629 return NULL_RTX;
15630 }
15631
15632 /* Check if immediate shift constants are within range. */
15633 bool
15634 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15635 {
15636 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15637 if (left)
15638 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15639 else
15640 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15641 }
15642
15643 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15644 operation of width WIDTH at bit position POS. */
15645
15646 rtx
15647 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15648 {
15649 gcc_assert (CONST_INT_P (width));
15650 gcc_assert (CONST_INT_P (pos));
15651
15652 unsigned HOST_WIDE_INT mask
15653 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15654 return GEN_INT (mask << UINTVAL (pos));
15655 }
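
/* For example, WIDTH == 8 and POS == 16 yield the mask 0x00ff0000,
   selecting the third byte of the source register.  */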
15656
15657 bool
15658 aarch64_mov_operand_p (rtx x, machine_mode mode)
15659 {
15660 if (GET_CODE (x) == HIGH
15661 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15662 return true;
15663
15664 if (CONST_INT_P (x))
15665 return true;
15666
15667 if (VECTOR_MODE_P (GET_MODE (x)))
15668 {
15669 /* Require predicate constants to be VNx16BI before RA, so that we
15670 force everything to have a canonical form. */
15671 if (!lra_in_progress
15672 && !reload_completed
15673 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15674 && GET_MODE (x) != VNx16BImode)
15675 return false;
15676
15677 return aarch64_simd_valid_immediate (x, NULL);
15678 }
15679
15680 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15681 return true;
15682
15683 if (aarch64_sve_cnt_immediate_p (x))
15684 return true;
15685
15686 return aarch64_classify_symbolic_expression (x)
15687 == SYMBOL_TINY_ABSOLUTE;
15688 }
15689
15690 /* Return a const_int vector of VAL. */
15691 rtx
15692 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15693 {
15694 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15695 return gen_const_vec_duplicate (mode, c);
15696 }
15697
15698 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15699
15700 bool
15701 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15702 {
15703 machine_mode vmode;
15704
15705 vmode = aarch64_simd_container_mode (mode, 64);
15706 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15707 return aarch64_simd_valid_immediate (op_v, NULL);
15708 }
15709
15710 /* Construct and return a PARALLEL RTX vector with elements numbering the
15711 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15712 the vector - from the perspective of the architecture. This does not
15713 line up with GCC's perspective on lane numbers, so we end up with
15714 different masks depending on our target endian-ness. The diagram
15715 below may help. We must draw the distinction when building masks
15716 which select one half of the vector. An instruction selecting
15717 architectural low-lanes for a big-endian target must be described using
15718 a mask selecting GCC high-lanes.
15719
15720 Big-Endian Little-Endian
15721
15722 GCC 0 1 2 3 3 2 1 0
15723 | x | x | x | x | | x | x | x | x |
15724 Architecture 3 2 1 0 3 2 1 0
15725
15726 Low Mask: { 2, 3 } { 0, 1 }
15727 High Mask: { 0, 1 } { 2, 3 }
15728
15729 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15730
15731 rtx
15732 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15733 {
15734 rtvec v = rtvec_alloc (nunits / 2);
15735 int high_base = nunits / 2;
15736 int low_base = 0;
15737 int base;
15738 rtx t1;
15739 int i;
15740
15741 if (BYTES_BIG_ENDIAN)
15742 base = high ? low_base : high_base;
15743 else
15744 base = high ? high_base : low_base;
15745
15746 for (i = 0; i < nunits / 2; i++)
15747 RTVEC_ELT (v, i) = GEN_INT (base + i);
15748
15749 t1 = gen_rtx_PARALLEL (mode, v);
15750 return t1;
15751 }
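
/* For example, for V4SImode (NUNITS == 4) and HIGH == true this returns
   (parallel [2 3]) on little-endian but (parallel [0 1]) on big-endian,
   matching the diagram above.  */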
15752
15753 /* Check OP for validity as a PARALLEL RTX vector with elements
15754 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15755 from the perspective of the architecture. See the diagram above
15756 aarch64_simd_vect_par_cnst_half for more details. */
15757
15758 bool
15759 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15760 bool high)
15761 {
15762 int nelts;
15763 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15764 return false;
15765
15766 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15767 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15768 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15769 int i = 0;
15770
15771 if (count_op != count_ideal)
15772 return false;
15773
15774 for (i = 0; i < count_ideal; i++)
15775 {
15776 rtx elt_op = XVECEXP (op, 0, i);
15777 rtx elt_ideal = XVECEXP (ideal, 0, i);
15778
15779 if (!CONST_INT_P (elt_op)
15780 || INTVAL (elt_ideal) != INTVAL (elt_op))
15781 return false;
15782 }
15783 return true;
15784 }
15785
15786 /* Return a PARALLEL containing NELTS elements, with element I equal
15787 to BASE + I * STEP. */
15788
15789 rtx
15790 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15791 {
15792 rtvec vec = rtvec_alloc (nelts);
15793 for (unsigned int i = 0; i < nelts; ++i)
15794 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15795 return gen_rtx_PARALLEL (VOIDmode, vec);
15796 }
15797
15798 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15799 series with step STEP. */
15800
15801 bool
15802 aarch64_stepped_int_parallel_p (rtx op, int step)
15803 {
15804 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15805 return false;
15806
15807 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15808 for (int i = 1; i < XVECLEN (op, 0); ++i)
15809 if (!CONST_INT_P (XVECEXP (op, 0, i))
15810 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15811 return false;
15812
15813 return true;
15814 }
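
/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) builds
   (parallel [1 3 5 7]), and aarch64_stepped_int_parallel_p accepts that
   rtx for STEP == 2 but rejects it for any other step.  */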
15815
15816 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15817 HIGH (exclusive). */
15818 void
15819 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15820 const_tree exp)
15821 {
15822 HOST_WIDE_INT lane;
15823 gcc_assert (CONST_INT_P (operand));
15824 lane = INTVAL (operand);
15825
15826 if (lane < low || lane >= high)
15827 {
15828 if (exp)
15829 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15830 else
15831 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15832 }
15833 }
15834
15835 /* Perform endian correction on lane number N, which indexes a vector
15836 of mode MODE, and return the result as an SImode rtx. */
15837
15838 rtx
15839 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15840 {
15841 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15842 }
15843
15844 /* Return TRUE if OP is a valid vector addressing mode. */
15845
15846 bool
15847 aarch64_simd_mem_operand_p (rtx op)
15848 {
15849 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15850 || REG_P (XEXP (op, 0)));
15851 }
15852
15853 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15854
15855 bool
15856 aarch64_sve_ld1r_operand_p (rtx op)
15857 {
15858 struct aarch64_address_info addr;
15859 scalar_mode mode;
15860
15861 return (MEM_P (op)
15862 && is_a <scalar_mode> (GET_MODE (op), &mode)
15863 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15864 && addr.type == ADDRESS_REG_IMM
15865 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15866 }
15867
15868 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15869 bool
15870 aarch64_sve_ld1rq_operand_p (rtx op)
15871 {
15872 struct aarch64_address_info addr;
15873 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15874 if (!MEM_P (op)
15875 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15876 return false;
15877
15878 if (addr.type == ADDRESS_REG_IMM)
15879 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15880
15881 if (addr.type == ADDRESS_REG_REG)
15882 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15883
15884 return false;
15885 }
15886
15887 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15888 The conditions for STR are the same. */
15889 bool
15890 aarch64_sve_ldr_operand_p (rtx op)
15891 {
15892 struct aarch64_address_info addr;
15893
15894 return (MEM_P (op)
15895 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15896 false, ADDR_QUERY_ANY)
15897 && addr.type == ADDRESS_REG_IMM);
15898 }
15899
15900 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15901 We need to be able to access the individual pieces, so the range
15902 is different from LD[234] and ST[234]. */
15903 bool
15904 aarch64_sve_struct_memory_operand_p (rtx op)
15905 {
15906 if (!MEM_P (op))
15907 return false;
15908
15909 machine_mode mode = GET_MODE (op);
15910 struct aarch64_address_info addr;
15911 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15912 ADDR_QUERY_ANY)
15913 || addr.type != ADDRESS_REG_IMM)
15914 return false;
15915
15916 poly_int64 first = addr.const_offset;
15917 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15918 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15919 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15920 }
15921
15922 /* Emit a register copy from operand to operand, taking care not to
15923 early-clobber source registers in the process.
15924
15925 COUNT is the number of components into which the copy needs to be
15926 decomposed. */
15927 void
15928 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15929 unsigned int count)
15930 {
15931 unsigned int i;
15932 int rdest = REGNO (operands[0]);
15933 int rsrc = REGNO (operands[1]);
15934
15935 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15936 || rdest < rsrc)
15937 for (i = 0; i < count; i++)
15938 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15939 gen_rtx_REG (mode, rsrc + i));
15940 else
15941 for (i = 0; i < count; i++)
15942 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15943 gen_rtx_REG (mode, rsrc + count - i - 1));
15944 }
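/* For instance, copying a two-register value from {v0, v1} into
   {v1, v2} (COUNT == 2) overlaps and has rdest > rsrc, so the second
   loop above copies backwards: v2 <- v1 first, then v1 <- v0, which
   avoids clobbering v1 before it has been read.  A copy in the other
   direction is done forwards for the same reason.  (Register numbers
   here are only illustrative.)  */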
15945
15946 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15947 one of the VSTRUCT modes: OI, CI, or XI. */
15948 int
15949 aarch64_simd_attr_length_rglist (machine_mode mode)
15950 {
15951 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15952 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15953 }
15954
15955 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15956 alignment of a vector to 128 bits. SVE predicates have an alignment of
15957 16 bits. */
15958 static HOST_WIDE_INT
15959 aarch64_simd_vector_alignment (const_tree type)
15960 {
15961 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15962 be set for non-predicate vectors of booleans. Modes are the most
15963 direct way we have of identifying real SVE predicate types. */
15964 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
15965 return 16;
15966 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15967 return 128;
15968 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15969 }
15970
15971 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15972 static poly_uint64
15973 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15974 {
15975 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15976 {
15977 /* If the length of the vector is fixed, try to align to that length,
15978 otherwise don't try to align at all. */
15979 HOST_WIDE_INT result;
15980 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15981 result = TYPE_ALIGN (TREE_TYPE (type));
15982 return result;
15983 }
15984 return TYPE_ALIGN (type);
15985 }
15986
15987 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15988 static bool
15989 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15990 {
15991 if (is_packed)
15992 return false;
15993
15994 /* For fixed-length vectors, check that the vectorizer will aim for
15995 full-vector alignment. This isn't true for generic GCC vectors
15996 that are wider than the ABI maximum of 128 bits. */
15997 poly_uint64 preferred_alignment =
15998 aarch64_vectorize_preferred_vector_alignment (type);
15999 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16000 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16001 preferred_alignment))
16002 return false;
16003
16004 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16005 return true;
16006 }
16007
16008 /* Return true if the vector misalignment factor is supported by the
16009 target. */
16010 static bool
16011 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16012 const_tree type, int misalignment,
16013 bool is_packed)
16014 {
16015 if (TARGET_SIMD && STRICT_ALIGNMENT)
16016 {
16017 /* Return false if the movmisalign pattern is not supported for this mode. */
16018 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16019 return false;
16020
16021 /* Misalignment factor is unknown at compile time. */
16022 if (misalignment == -1)
16023 return false;
16024 }
16025 return default_builtin_support_vector_misalignment (mode, type, misalignment,
16026 is_packed);
16027 }
16028
16029 /* If VALS is a vector constant that can be loaded into a register
16030 using DUP, generate instructions to do so and return an RTX to
16031 assign to the register. Otherwise return NULL_RTX. */
16032 static rtx
16033 aarch64_simd_dup_constant (rtx vals)
16034 {
16035 machine_mode mode = GET_MODE (vals);
16036 machine_mode inner_mode = GET_MODE_INNER (mode);
16037 rtx x;
16038
16039 if (!const_vec_duplicate_p (vals, &x))
16040 return NULL_RTX;
16041
16042 /* We can load this constant by using DUP and a constant in a
16043 single general-purpose register. This will be cheaper than a vector
16044 load. */
16045 x = copy_to_mode_reg (inner_mode, x);
16046 return gen_vec_duplicate (mode, x);
16047 }
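/* As a rough example, a V4SImode constant { 7, 7, 7, 7 } is returned as
   (vec_duplicate:V4SI (reg:SI ...)) with 7 loaded into the scalar
   register, which typically assembles to something like:

       mov  w0, 7
       dup  v0.4s, w0

   rather than a literal-pool load.  (Exact registers and instruction
   selection depend on later passes.)  */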
16048
16049
16050 /* Generate code to load VALS, which is a PARALLEL containing only
16051 constants (for vec_init) or CONST_VECTOR, efficiently into a
16052 register. Returns an RTX to copy into the register, or NULL_RTX
16053 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16054 static rtx
16055 aarch64_simd_make_constant (rtx vals)
16056 {
16057 machine_mode mode = GET_MODE (vals);
16058 rtx const_dup;
16059 rtx const_vec = NULL_RTX;
16060 int n_const = 0;
16061 int i;
16062
16063 if (GET_CODE (vals) == CONST_VECTOR)
16064 const_vec = vals;
16065 else if (GET_CODE (vals) == PARALLEL)
16066 {
16067 /* A CONST_VECTOR must contain only CONST_INTs and
16068 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16069 Only store valid constants in a CONST_VECTOR. */
16070 int n_elts = XVECLEN (vals, 0);
16071 for (i = 0; i < n_elts; ++i)
16072 {
16073 rtx x = XVECEXP (vals, 0, i);
16074 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16075 n_const++;
16076 }
16077 if (n_const == n_elts)
16078 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16079 }
16080 else
16081 gcc_unreachable ();
16082
16083 if (const_vec != NULL_RTX
16084 && aarch64_simd_valid_immediate (const_vec, NULL))
16085 /* Load using MOVI/MVNI. */
16086 return const_vec;
16087 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16088 /* Loaded using DUP. */
16089 return const_dup;
16090 else if (const_vec != NULL_RTX)
16091 /* Load from constant pool. We cannot take advantage of single-cycle
16092 LD1 because we need a PC-relative addressing mode. */
16093 return const_vec;
16094 else
16095 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16096 We cannot construct an initializer. */
16097 return NULL_RTX;
16098 }
16099
16100 /* Expand a vector initialisation sequence, such that TARGET is
16101 initialised to contain VALS. */
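/* A rough example of the all-variable path handled below: for a
   V4SImode value { x, y, x, x } the element x occurs most often, so we
   expect a sequence along the lines of

       dup  v0.4s, w_x
       ins  v0.s[1], w_y

   i.e. one duplicate of the most common element followed by lane
   inserts for the rest.  (Lane numbers and registers are only
   illustrative; big-endian lane handling differs.)  */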
16102
16103 void
16104 aarch64_expand_vector_init (rtx target, rtx vals)
16105 {
16106 machine_mode mode = GET_MODE (target);
16107 scalar_mode inner_mode = GET_MODE_INNER (mode);
16108 /* The number of vector elements. */
16109 int n_elts = XVECLEN (vals, 0);
16110 /* The number of vector elements which are not constant. */
16111 int n_var = 0;
16112 rtx any_const = NULL_RTX;
16113 /* The first element of vals. */
16114 rtx v0 = XVECEXP (vals, 0, 0);
16115 bool all_same = true;
16116
16117 /* This is a special vec_init<M><N> where N is not an element mode but a
16118 vector mode with half the elements of M. We expect to find two entries
16119 of mode N in VALS and we must put their concatenation into TARGET. */
16120 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16121 {
16122 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16123 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16124 rtx lo = XVECEXP (vals, 0, 0);
16125 rtx hi = XVECEXP (vals, 0, 1);
16126 machine_mode narrow_mode = GET_MODE (lo);
16127 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16128 gcc_assert (narrow_mode == GET_MODE (hi));
16129
16130 /* When we want to concatenate a half-width vector with zeroes we can
16131 use the aarch64_combinez[_be] patterns. Just make sure that the
16132 zeroes are in the right half. */
16133 if (BYTES_BIG_ENDIAN
16134 && aarch64_simd_imm_zero (lo, narrow_mode)
16135 && general_operand (hi, narrow_mode))
16136 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16137 else if (!BYTES_BIG_ENDIAN
16138 && aarch64_simd_imm_zero (hi, narrow_mode)
16139 && general_operand (lo, narrow_mode))
16140 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16141 else
16142 {
16143 /* Else create the two half-width registers and combine them. */
16144 if (!REG_P (lo))
16145 lo = force_reg (GET_MODE (lo), lo);
16146 if (!REG_P (hi))
16147 hi = force_reg (GET_MODE (hi), hi);
16148
16149 if (BYTES_BIG_ENDIAN)
16150 std::swap (lo, hi);
16151 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16152 }
16153 return;
16154 }
16155
16156 /* Count the number of variable elements to initialise. */
16157 for (int i = 0; i < n_elts; ++i)
16158 {
16159 rtx x = XVECEXP (vals, 0, i);
16160 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16161 ++n_var;
16162 else
16163 any_const = x;
16164
16165 all_same &= rtx_equal_p (x, v0);
16166 }
16167
16168 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16169 how best to handle this. */
16170 if (n_var == 0)
16171 {
16172 rtx constant = aarch64_simd_make_constant (vals);
16173 if (constant != NULL_RTX)
16174 {
16175 emit_move_insn (target, constant);
16176 return;
16177 }
16178 }
16179
16180 /* Splat a single non-constant element if we can. */
16181 if (all_same)
16182 {
16183 rtx x = copy_to_mode_reg (inner_mode, v0);
16184 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16185 return;
16186 }
16187
16188 enum insn_code icode = optab_handler (vec_set_optab, mode);
16189 gcc_assert (icode != CODE_FOR_nothing);
16190
16191 /* If there are only variable elements, try to optimize
16192 the insertion using dup for the most common element
16193 followed by insertions. */
16194
16195 /* The algorithm will fill matches[*][0] with the earliest matching element,
16196 and matches[X][1] with the count of duplicate elements (if X is the
16197 earliest element which has duplicates). */
16198
16199 if (n_var == n_elts && n_elts <= 16)
16200 {
16201 int matches[16][2] = {0};
16202 for (int i = 0; i < n_elts; i++)
16203 {
16204 for (int j = 0; j <= i; j++)
16205 {
16206 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16207 {
16208 matches[i][0] = j;
16209 matches[j][1]++;
16210 break;
16211 }
16212 }
16213 }
16214 int maxelement = 0;
16215 int maxv = 0;
16216 for (int i = 0; i < n_elts; i++)
16217 if (matches[i][1] > maxv)
16218 {
16219 maxelement = i;
16220 maxv = matches[i][1];
16221 }
16222
16223 /* Create a duplicate of the most common element, unless all elements
16224 are equally useless to us, in which case just immediately set the
16225 vector register using the first element. */
16226
16227 if (maxv == 1)
16228 {
16229 /* For vectors of two 64-bit elements, we can do even better. */
16230 if (n_elts == 2
16231 && (inner_mode == E_DImode
16232 || inner_mode == E_DFmode))
16233
16234 {
16235 rtx x0 = XVECEXP (vals, 0, 0);
16236 rtx x1 = XVECEXP (vals, 0, 1);
16237 /* Combine can pick up this case, but handling it directly
16238 here leaves clearer RTL.
16239
16240 This is load_pair_lanes<mode>, and also gives us a clean-up
16241 for store_pair_lanes<mode>. */
16242 if (memory_operand (x0, inner_mode)
16243 && memory_operand (x1, inner_mode)
16244 && !STRICT_ALIGNMENT
16245 && rtx_equal_p (XEXP (x1, 0),
16246 plus_constant (Pmode,
16247 XEXP (x0, 0),
16248 GET_MODE_SIZE (inner_mode))))
16249 {
16250 rtx t;
16251 if (inner_mode == DFmode)
16252 t = gen_load_pair_lanesdf (target, x0, x1);
16253 else
16254 t = gen_load_pair_lanesdi (target, x0, x1);
16255 emit_insn (t);
16256 return;
16257 }
16258 }
16259 /* The subreg-move sequence below will move into lane zero of the
16260 vector register. For big-endian we want that position to hold
16261 the last element of VALS. */
16262 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16263 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16264 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16265 }
16266 else
16267 {
16268 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16269 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16270 }
16271
16272 /* Insert the rest. */
16273 for (int i = 0; i < n_elts; i++)
16274 {
16275 rtx x = XVECEXP (vals, 0, i);
16276 if (matches[i][0] == maxelement)
16277 continue;
16278 x = copy_to_mode_reg (inner_mode, x);
16279 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16280 }
16281 return;
16282 }
16283
16284 /* Initialise a vector which is part-variable. We want to first try
16285 to build those lanes which are constant in the most efficient way we
16286 can. */
16287 if (n_var != n_elts)
16288 {
16289 rtx copy = copy_rtx (vals);
16290
16291 /* Load constant part of vector. We really don't care what goes into the
16292 parts we will overwrite, but we're more likely to be able to load the
16293 constant efficiently if it has fewer, larger, repeating parts
16294 (see aarch64_simd_valid_immediate). */
16295 for (int i = 0; i < n_elts; i++)
16296 {
16297 rtx x = XVECEXP (vals, 0, i);
16298 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16299 continue;
16300 rtx subst = any_const;
16301 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16302 {
16303 /* Look in the copied vector, as more elements are const. */
16304 rtx test = XVECEXP (copy, 0, i ^ bit);
16305 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16306 {
16307 subst = test;
16308 break;
16309 }
16310 }
16311 XVECEXP (copy, 0, i) = subst;
16312 }
16313 aarch64_expand_vector_init (target, copy);
16314 }
16315
16316 /* Insert the variable lanes directly. */
16317 for (int i = 0; i < n_elts; i++)
16318 {
16319 rtx x = XVECEXP (vals, 0, i);
16320 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16321 continue;
16322 x = copy_to_mode_reg (inner_mode, x);
16323 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16324 }
16325 }
16326
16327 /* Emit RTL corresponding to:
16328 insr TARGET, ELEM. */
16329
16330 static void
16331 emit_insr (rtx target, rtx elem)
16332 {
16333 machine_mode mode = GET_MODE (target);
16334 scalar_mode elem_mode = GET_MODE_INNER (mode);
16335 elem = force_reg (elem_mode, elem);
16336
16337 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16338 gcc_assert (icode != CODE_FOR_nothing);
16339 emit_insn (GEN_FCN (icode) (target, target, elem));
16340 }
16341
16342 /* Subroutine of aarch64_sve_expand_vector_init for handling
16343 trailing constants.
16344 This function works as follows:
16345 (a) Create a new vector consisting of trailing constants.
16346 (b) Initialize TARGET with the constant vector using emit_move_insn.
16347 (c) Insert remaining elements in TARGET using insr.
16348 NELTS is the total number of elements in original vector while
16349 while NELTS_REQD is the number of elements that are actually
16350 significant.
16351
16352 ??? The heuristic used is to do the above only if the number of constants
16353 is at least half the total number of elements. May need fine-tuning. */
16354
16355 static bool
16356 aarch64_sve_expand_vector_init_handle_trailing_constants
16357 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16358 {
16359 machine_mode mode = GET_MODE (target);
16360 scalar_mode elem_mode = GET_MODE_INNER (mode);
16361 int n_trailing_constants = 0;
16362
16363 for (int i = nelts_reqd - 1;
16364 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16365 i--)
16366 n_trailing_constants++;
16367
16368 if (n_trailing_constants >= nelts_reqd / 2)
16369 {
16370 rtx_vector_builder v (mode, 1, nelts);
16371 for (int i = 0; i < nelts; i++)
16372 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16373 rtx const_vec = v.build ();
16374 emit_move_insn (target, const_vec);
16375
16376 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16377 emit_insr (target, builder.elt (i));
16378
16379 return true;
16380 }
16381
16382 return false;
16383 }
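/* Worked example (illustrative): for BUILDER = { a, b, 1, 2 } with
   NELTS_REQD == 4 there are two trailing constants, which meets the
   "at least half" threshold, so the expected shape is roughly:

       TARGET = { 1, 2, ... }      (constant move)
       insr   TARGET, b
       insr   TARGET, a

   leaving TARGET = { a, b, 1, 2, ... }.  */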
16384
16385 /* Subroutine of aarch64_sve_expand_vector_init.
16386 Works as follows:
16387 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16388 (b) Skip trailing elements from BUILDER, which are the same as
16389 element NELTS_REQD - 1.
16390 (c) Insert earlier elements in reverse order in TARGET using insr. */
16391
16392 static void
16393 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16394 const rtx_vector_builder &builder,
16395 int nelts_reqd)
16396 {
16397 machine_mode mode = GET_MODE (target);
16398 scalar_mode elem_mode = GET_MODE_INNER (mode);
16399
16400 struct expand_operand ops[2];
16401 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16402 gcc_assert (icode != CODE_FOR_nothing);
16403
16404 create_output_operand (&ops[0], target, mode);
16405 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16406 expand_insn (icode, 2, ops);
16407
16408 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16409 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16410 emit_insr (target, builder.elt (i));
16411 }
16412
16413 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16414 when all trailing elements of BUILDER are the same.
16415 This works as follows:
16416 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16417 (b) Insert remaining elements in TARGET using insr.
16418
16419 ??? The heuristic used is to do the above if the number of identical
16420 trailing elements is at least 3/4 of the total number of elements,
16421 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16422
16423 static bool
16424 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16425 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16426 {
16427 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16428 if (ndups >= (3 * nelts_reqd) / 4)
16429 {
16430 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16431 nelts_reqd - ndups + 1);
16432 return true;
16433 }
16434
16435 return false;
16436 }
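/* Worked example (illustrative): for BUILDER = { a, b, c, c, c, c, c, c }
   with NELTS_REQD == 8, six of the eight elements are the trailing value
   c, which meets the 3/4 threshold, so we expect roughly:

       dup   TARGET, c
       insr  TARGET, b
       insr  TARGET, a

   giving TARGET = { a, b, c, c, c, c, c, c }.  */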
16437
16438 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16439 of elements in BUILDER.
16440
16441 The function tries to initialize TARGET from BUILDER if it fits one
16442 of the special cases outlined below.
16443
16444 Failing that, the function divides BUILDER into two sub-vectors:
16445 v_even = even elements of BUILDER;
16446 v_odd = odd elements of BUILDER;
16447
16448 and recursively calls itself with v_even and v_odd.
16449
16450 if (recursive call succeeded for v_even or v_odd)
16451 TARGET = zip (v_even, v_odd)
16452
16453 The function returns true if it managed to build TARGET from BUILDER
16454 with one of the special cases, false otherwise.
16455
16456 Example: {a, 1, b, 2, c, 3, d, 4}
16457
16458 The vector gets divided into:
16459 v_even = {a, b, c, d}
16460 v_odd = {1, 2, 3, 4}
16461
16462 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16463 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16464
16465 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16466 4 elements, so we construct tmp1 from v_even using insr:
16467 tmp1 = dup(d)
16468 insr tmp1, c
16469 insr tmp1, b
16470 insr tmp1, a
16471
16472 And finally:
16473 TARGET = zip (tmp1, tmp2)
16474 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16475
16476 static bool
16477 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16478 int nelts, int nelts_reqd)
16479 {
16480 machine_mode mode = GET_MODE (target);
16481
16482 /* Case 1: Vector contains trailing constants. */
16483
16484 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16485 (target, builder, nelts, nelts_reqd))
16486 return true;
16487
16488 /* Case 2: Vector contains leading constants. */
16489
16490 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16491 for (int i = 0; i < nelts_reqd; i++)
16492 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16493 rev_builder.finalize ();
16494
16495 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16496 (target, rev_builder, nelts, nelts_reqd))
16497 {
16498 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16499 return true;
16500 }
16501
16502 /* Case 3: Vector contains trailing same element. */
16503
16504 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16505 (target, builder, nelts_reqd))
16506 return true;
16507
16508 /* Case 4: Vector contains leading same element. */
16509
16510 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16511 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16512 {
16513 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16514 return true;
16515 }
16516
16517 /* Avoid recursing below 4 elements.
16518 ??? The threshold 4 may need fine-tuning. */
16519
16520 if (nelts_reqd <= 4)
16521 return false;
16522
16523 rtx_vector_builder v_even (mode, 1, nelts);
16524 rtx_vector_builder v_odd (mode, 1, nelts);
16525
16526 for (int i = 0; i < nelts * 2; i += 2)
16527 {
16528 v_even.quick_push (builder.elt (i));
16529 v_odd.quick_push (builder.elt (i + 1));
16530 }
16531
16532 v_even.finalize ();
16533 v_odd.finalize ();
16534
16535 rtx tmp1 = gen_reg_rtx (mode);
16536 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16537 nelts, nelts_reqd / 2);
16538
16539 rtx tmp2 = gen_reg_rtx (mode);
16540 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16541 nelts, nelts_reqd / 2);
16542
16543 if (!did_even_p && !did_odd_p)
16544 return false;
16545
16546 /* Initialize whichever of v_even and v_odd did not match any of the
16547 special cases using INSR, then zip v_even and v_odd into TARGET. */
16548
16549 if (!did_even_p)
16550 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16551
16552 if (!did_odd_p)
16553 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16554
16555 rtvec v = gen_rtvec (2, tmp1, tmp2);
16556 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16557 return true;
16558 }
16559
16560 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16561
16562 void
16563 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16564 {
16565 machine_mode mode = GET_MODE (target);
16566 int nelts = XVECLEN (vals, 0);
16567
16568 rtx_vector_builder v (mode, 1, nelts);
16569 for (int i = 0; i < nelts; i++)
16570 v.quick_push (XVECEXP (vals, 0, i));
16571 v.finalize ();
16572
16573 /* If neither sub-vector of v could be initialized specially,
16574 then use INSR to insert all elements from v into TARGET.
16575 ??? This might not be optimal for vectors with large
16576 initializers of 16 elements or more.
16577 For nelts < 4, it probably isn't useful to handle specially. */
16578
16579 if (nelts < 4
16580 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16581 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16582 }
16583
16584 /* Check whether VALUE is a vector constant in which every element
16585 is either a power of 2 or a negated power of 2. If so, return
16586 a constant vector of log2s, and flip CODE between PLUS and MINUS
16587 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16588
16589 static rtx
16590 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16591 {
16592 if (GET_CODE (value) != CONST_VECTOR)
16593 return NULL_RTX;
16594
16595 rtx_vector_builder builder;
16596 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16597 return NULL_RTX;
16598
16599 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16600 /* 1 if the result of the multiplication must be negated,
16601 0 if it mustn't, or -1 if we don't yet care. */
16602 int negate = -1;
16603 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16604 for (unsigned int i = 0; i < encoded_nelts; ++i)
16605 {
16606 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16607 if (!CONST_SCALAR_INT_P (elt))
16608 return NULL_RTX;
16609 rtx_mode_t val (elt, int_mode);
16610 wide_int pow2 = wi::neg (val);
16611 if (val != pow2)
16612 {
16613 /* It matters whether we negate or not. Make that choice,
16614 and make sure that it's consistent with previous elements. */
16615 if (negate == !wi::neg_p (val))
16616 return NULL_RTX;
16617 negate = wi::neg_p (val);
16618 if (!negate)
16619 pow2 = val;
16620 }
16621 /* POW2 is now the value that we want to be a power of 2. */
16622 int shift = wi::exact_log2 (pow2);
16623 if (shift < 0)
16624 return NULL_RTX;
16625 builder.quick_push (gen_int_mode (shift, int_mode));
16626 }
16627 if (negate == -1)
16628 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16629 code = PLUS;
16630 else if (negate == 1)
16631 code = code == PLUS ? MINUS : PLUS;
16632 return builder.build ();
16633 }
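/* For example, a CONST_VECTOR of { -4, -4, -4, -4 } with CODE == PLUS
   is converted to the shift vector { 2, 2, 2, 2 } and CODE is flipped
   to MINUS, so that a + b * -4 can be emitted by the callers below as
   a - (b << 2).  */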
16634
16635 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16636 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16637 operands array, in the same order as for fma_optab. Return true if
16638 the function emitted all the necessary instructions, false if the caller
16639 should generate the pattern normally with the new OPERANDS array. */
16640
16641 bool
16642 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16643 {
16644 machine_mode mode = GET_MODE (operands[0]);
16645 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16646 {
16647 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16648 NULL_RTX, true, OPTAB_DIRECT);
16649 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16650 operands[3], product, operands[0], true,
16651 OPTAB_DIRECT);
16652 return true;
16653 }
16654 operands[2] = force_reg (mode, operands[2]);
16655 return false;
16656 }
16657
16658 /* Likewise, but for a conditional pattern. */
16659
16660 bool
16661 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16662 {
16663 machine_mode mode = GET_MODE (operands[0]);
16664 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16665 {
16666 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16667 NULL_RTX, true, OPTAB_DIRECT);
16668 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16669 operands[4], product, operands[5]));
16670 return true;
16671 }
16672 operands[3] = force_reg (mode, operands[3]);
16673 return false;
16674 }
16675
16676 static unsigned HOST_WIDE_INT
16677 aarch64_shift_truncation_mask (machine_mode mode)
16678 {
16679 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16680 return 0;
16681 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16682 }
16683
16684 /* Select a format to encode pointers in exception handling data. */
16685 int
16686 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16687 {
16688 int type;
16689 switch (aarch64_cmodel)
16690 {
16691 case AARCH64_CMODEL_TINY:
16692 case AARCH64_CMODEL_TINY_PIC:
16693 case AARCH64_CMODEL_SMALL:
16694 case AARCH64_CMODEL_SMALL_PIC:
16695 case AARCH64_CMODEL_SMALL_SPIC:
16696 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
16697 for everything. */
16698 type = DW_EH_PE_sdata4;
16699 break;
16700 default:
16701 /* No assumptions here. 8-byte relocs required. */
16702 type = DW_EH_PE_sdata8;
16703 break;
16704 }
16705 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16706 }
16707
16708 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16709
16710 static void
16711 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16712 {
16713 if (aarch64_simd_decl_p (decl))
16714 {
16715 fprintf (stream, "\t.variant_pcs\t");
16716 assemble_name (stream, name);
16717 fprintf (stream, "\n");
16718 }
16719 }
16720
16721 /* The last .arch and .tune assembly strings that we printed. */
16722 static std::string aarch64_last_printed_arch_string;
16723 static std::string aarch64_last_printed_tune_string;
16724
16725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16726 by the function fndecl. */
16727
16728 void
16729 aarch64_declare_function_name (FILE *stream, const char* name,
16730 tree fndecl)
16731 {
16732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16733
16734 struct cl_target_option *targ_options;
16735 if (target_parts)
16736 targ_options = TREE_TARGET_OPTION (target_parts);
16737 else
16738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16739 gcc_assert (targ_options);
16740
16741 const struct processor *this_arch
16742 = aarch64_get_arch (targ_options->x_explicit_arch);
16743
16744 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16745 std::string extension
16746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16747 this_arch->flags);
16748 /* Only update the assembler .arch string if it is distinct from the last
16749 such string we printed. */
16750 std::string to_print = this_arch->name + extension;
16751 if (to_print != aarch64_last_printed_arch_string)
16752 {
16753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16754 aarch64_last_printed_arch_string = to_print;
16755 }
16756
16757 /* Print the cpu name we're tuning for in the comments; it might be
16758 useful to readers of the generated asm. Do it only when it changes
16759 from function to function and verbose assembly is requested. */
16760 const struct processor *this_tune
16761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16762
16763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16764 {
16765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16766 this_tune->name);
16767 aarch64_last_printed_tune_string = this_tune->name;
16768 }
16769
16770 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16771
16772 /* Don't forget the type directive for ELF. */
16773 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16774 ASM_OUTPUT_LABEL (stream, name);
16775 }
16776
16777 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16778
16779 void
16780 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16781 {
16782 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16783 const char *value = IDENTIFIER_POINTER (target);
16784 aarch64_asm_output_variant_pcs (stream, decl, name);
16785 ASM_OUTPUT_DEF (stream, name, value);
16786 }
16787
16788 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16789 function symbol references. */
16790
16791 void
16792 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16793 {
16794 default_elf_asm_output_external (stream, decl, name);
16795 aarch64_asm_output_variant_pcs (stream, decl, name);
16796 }
16797
16798 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16799 Used to output the .cfi_b_key_frame directive when signing the current
16800 function with the B key. */
16801
16802 void
16803 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16804 {
16805 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16806 && aarch64_ra_sign_key == AARCH64_KEY_B)
16807 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16808 }
16809
16810 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16811
16812 static void
16813 aarch64_start_file (void)
16814 {
16815 struct cl_target_option *default_options
16816 = TREE_TARGET_OPTION (target_option_default_node);
16817
16818 const struct processor *default_arch
16819 = aarch64_get_arch (default_options->x_explicit_arch);
16820 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16821 std::string extension
16822 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16823 default_arch->flags);
16824
16825 aarch64_last_printed_arch_string = default_arch->name + extension;
16826 aarch64_last_printed_tune_string = "";
16827 asm_fprintf (asm_out_file, "\t.arch %s\n",
16828 aarch64_last_printed_arch_string.c_str ());
16829
16830 default_file_start ();
16831 }
16832
16833 /* Emit load exclusive. */
16834
16835 static void
16836 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16837 rtx mem, rtx model_rtx)
16838 {
16839 if (mode == TImode)
16840 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
16841 gen_highpart (DImode, rval),
16842 mem, model_rtx));
16843 else
16844 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16845 }
16846
16847 /* Emit store exclusive. */
16848
16849 static void
16850 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16851 rtx mem, rtx rval, rtx model_rtx)
16852 {
16853 if (mode == TImode)
16854 emit_insn (gen_aarch64_store_exclusive_pair
16855 (bval, mem, operand_subword (rval, 0, 0, TImode),
16856 operand_subword (rval, 1, 0, TImode), model_rtx));
16857 else
16858 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16859 }
16860
16861 /* Mark the previous jump instruction as unlikely. */
16862
16863 static void
16864 aarch64_emit_unlikely_jump (rtx insn)
16865 {
16866 rtx_insn *jump = emit_jump_insn (insn);
16867 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16868 }
16869
16870 /* We store the names of the various atomic helpers in a 5x4 array.
16871 Return the libcall function given MODE, MODEL and NAMES. */
16872
16873 rtx
16874 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
16875 const atomic_ool_names *names)
16876 {
16877 memmodel model = memmodel_base (INTVAL (model_rtx));
16878 int mode_idx, model_idx;
16879
16880 switch (mode)
16881 {
16882 case E_QImode:
16883 mode_idx = 0;
16884 break;
16885 case E_HImode:
16886 mode_idx = 1;
16887 break;
16888 case E_SImode:
16889 mode_idx = 2;
16890 break;
16891 case E_DImode:
16892 mode_idx = 3;
16893 break;
16894 case E_TImode:
16895 mode_idx = 4;
16896 break;
16897 default:
16898 gcc_unreachable ();
16899 }
16900
16901 switch (model)
16902 {
16903 case MEMMODEL_RELAXED:
16904 model_idx = 0;
16905 break;
16906 case MEMMODEL_CONSUME:
16907 case MEMMODEL_ACQUIRE:
16908 model_idx = 1;
16909 break;
16910 case MEMMODEL_RELEASE:
16911 model_idx = 2;
16912 break;
16913 case MEMMODEL_ACQ_REL:
16914 case MEMMODEL_SEQ_CST:
16915 model_idx = 3;
16916 break;
16917 default:
16918 gcc_unreachable ();
16919 }
16920
16921 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
16922 VISIBILITY_HIDDEN);
16923 }
16924
16925 #define DEF0(B, N) \
16926 { "__aarch64_" #B #N "_relax", \
16927 "__aarch64_" #B #N "_acq", \
16928 "__aarch64_" #B #N "_rel", \
16929 "__aarch64_" #B #N "_acq_rel" }
16930
16931 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
16932 { NULL, NULL, NULL, NULL }
16933 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
16934
16935 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
16936 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
16937 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
16938 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
16939 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
16940 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
16941
16942 #undef DEF0
16943 #undef DEF4
16944 #undef DEF5
16945
16946 /* Expand a compare and swap pattern. */
16947
16948 void
16949 aarch64_expand_compare_and_swap (rtx operands[])
16950 {
16951 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16952 machine_mode mode, r_mode;
16953
16954 bval = operands[0];
16955 rval = operands[1];
16956 mem = operands[2];
16957 oldval = operands[3];
16958 newval = operands[4];
16959 is_weak = operands[5];
16960 mod_s = operands[6];
16961 mod_f = operands[7];
16962 mode = GET_MODE (mem);
16963
16964 /* Normally the succ memory model must be stronger than fail, but in the
16965 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16966 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16967 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16968 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16969 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16970
16971 r_mode = mode;
16972 if (mode == QImode || mode == HImode)
16973 {
16974 r_mode = SImode;
16975 rval = gen_reg_rtx (r_mode);
16976 }
16977
16978 if (TARGET_LSE)
16979 {
16980 /* The CAS insn requires oldval and rval overlap, but we need to
16981 have a copy of oldval saved across the operation to tell if
16982 the operation is successful. */
16983 if (reg_overlap_mentioned_p (rval, oldval))
16984 rval = copy_to_mode_reg (r_mode, oldval);
16985 else
16986 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16987
16988 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16989 newval, mod_s));
16990 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16991 }
16992 else if (TARGET_OUTLINE_ATOMICS)
16993 {
16994 /* Oldval must satisfy compare afterward. */
16995 if (!aarch64_plus_operand (oldval, mode))
16996 oldval = force_reg (mode, oldval);
16997 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
16998 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
16999 oldval, mode, newval, mode,
17000 XEXP (mem, 0), Pmode);
17001 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17002 }
17003 else
17004 {
17005 /* The oldval predicate varies by mode. Test it and force to reg. */
17006 insn_code code = code_for_aarch64_compare_and_swap (mode);
17007 if (!insn_data[code].operand[2].predicate (oldval, mode))
17008 oldval = force_reg (mode, oldval);
17009
17010 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17011 is_weak, mod_s, mod_f));
17012 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17013 }
17014
17015 if (r_mode != mode)
17016 rval = gen_lowpart (mode, rval);
17017 emit_move_insn (operands[1], rval);
17018
17019 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
17020 emit_insn (gen_rtx_SET (bval, x));
17021 }
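/* With -moutline-atomics, the TARGET_OUTLINE_ATOMICS path above turns
   an SImode __atomic_compare_exchange into something roughly like
   (registers purely illustrative):

       mov  w0, w_oldval
       mov  w1, w_newval
       mov  x2, x_mem
       bl   __aarch64_cas4_acq_rel
       cmp  w0, w_oldval
       cset w_bval, eq

   i.e. the library call returns the observed value, which is then
   compared against the expected value to produce the boolean result.  */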
17022
17023 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17024 sequence implementing an atomic operation. */
17025
17026 static void
17027 aarch64_emit_post_barrier (enum memmodel model)
17028 {
17029 const enum memmodel base_model = memmodel_base (model);
17030
17031 if (is_mm_sync (model)
17032 && (base_model == MEMMODEL_ACQUIRE
17033 || base_model == MEMMODEL_ACQ_REL
17034 || base_model == MEMMODEL_SEQ_CST))
17035 {
17036 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
17037 }
17038 }
17039
17040 /* Split a compare and swap pattern. */
17041
17042 void
17043 aarch64_split_compare_and_swap (rtx operands[])
17044 {
17045 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
17046 machine_mode mode;
17047 bool is_weak;
17048 rtx_code_label *label1, *label2;
17049 enum memmodel model;
17050
17051 rval = operands[0];
17052 mem = operands[1];
17053 oldval = operands[2];
17054 newval = operands[3];
17055 is_weak = (operands[4] != const0_rtx);
17056 model_rtx = operands[5];
17057 scratch = operands[7];
17058 mode = GET_MODE (mem);
17059 model = memmodel_from_int (INTVAL (model_rtx));
17060
17061 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17062 loop:
17063 .label1:
17064 LD[A]XR rval, [mem]
17065 CBNZ rval, .label2
17066 ST[L]XR scratch, newval, [mem]
17067 CBNZ scratch, .label1
17068 .label2:
17069 CMP rval, 0. */
17070 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
17071 oldval == const0_rtx && mode != TImode);
17072
17073 label1 = NULL;
17074 if (!is_weak)
17075 {
17076 label1 = gen_label_rtx ();
17077 emit_label (label1);
17078 }
17079 label2 = gen_label_rtx ();
17080
17081 /* The initial load can be relaxed for a __sync operation since a final
17082 barrier will be emitted to stop code hoisting. */
17083 if (is_mm_sync (model))
17084 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
17085 else
17086 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
17087
17088 if (strong_zero_p)
17089 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17090 else
17091 {
17092 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17093 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17094 }
17095 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17096 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
17097 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17098
17099 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
17100
17101 if (!is_weak)
17102 {
17103 if (aarch64_track_speculation)
17104 {
17105 /* Emit an explicit compare instruction, so that we can correctly
17106 track the condition codes. */
17107 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17108 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17109 }
17110 else
17111 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
17112
17113 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17114 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
17115 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17116 }
17117 else
17118 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17119
17120 emit_label (label2);
17121
17122 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17123 to set the condition flags. If this is not used it will be removed by
17124 later passes. */
17125 if (strong_zero_p)
17126 aarch64_gen_compare_reg (NE, rval, const0_rtx);
17127
17128 /* Emit any final barrier needed for a __sync operation. */
17129 if (is_mm_sync (model))
17130 aarch64_emit_post_barrier (model);
17131 }
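/* For reference, the general strong sequence (OLDVAL not known to be
   zero) produced by the code above looks roughly like:

     .retry:
       ld[a]xr  w_rval, [x_mem]
       cmp      w_rval, w_oldval
       b.ne     .done
       st[l]xr  w_scratch, w_newval, [x_mem]
       cbnz     w_scratch, .retry
     .done:

   where the acquire/release forms of the exclusives depend on the
   memory model.  (Illustrative only; the exact compare and registers
   depend on the mode and on -mtrack-speculation.)  */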
17132
17133 /* Split an atomic operation. */
17134
17135 void
17136 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17137 rtx value, rtx model_rtx, rtx cond)
17138 {
17139 machine_mode mode = GET_MODE (mem);
17140 machine_mode wmode = (mode == DImode ? DImode : SImode);
17141 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17142 const bool is_sync = is_mm_sync (model);
17143 rtx_code_label *label;
17144 rtx x;
17145
17146 /* Split the atomic operation into a sequence. */
17147 label = gen_label_rtx ();
17148 emit_label (label);
17149
17150 if (new_out)
17151 new_out = gen_lowpart (wmode, new_out);
17152 if (old_out)
17153 old_out = gen_lowpart (wmode, old_out);
17154 else
17155 old_out = new_out;
17156 value = simplify_gen_subreg (wmode, value, mode, 0);
17157
17158 /* The initial load can be relaxed for a __sync operation since a final
17159 barrier will be emitted to stop code hoisting. */
17160 if (is_sync)
17161 aarch64_emit_load_exclusive (mode, old_out, mem,
17162 GEN_INT (MEMMODEL_RELAXED));
17163 else
17164 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17165
17166 switch (code)
17167 {
17168 case SET:
17169 new_out = value;
17170 break;
17171
17172 case NOT:
17173 x = gen_rtx_AND (wmode, old_out, value);
17174 emit_insn (gen_rtx_SET (new_out, x));
17175 x = gen_rtx_NOT (wmode, new_out);
17176 emit_insn (gen_rtx_SET (new_out, x));
17177 break;
17178
17179 case MINUS:
17180 if (CONST_INT_P (value))
17181 {
17182 value = GEN_INT (-INTVAL (value));
17183 code = PLUS;
17184 }
17185 /* Fall through. */
17186
17187 default:
17188 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17189 emit_insn (gen_rtx_SET (new_out, x));
17190 break;
17191 }
17192
17193 aarch64_emit_store_exclusive (mode, cond, mem,
17194 gen_lowpart (mode, new_out), model_rtx);
17195
17196 if (aarch64_track_speculation)
17197 {
17198 /* Emit an explicit compare instruction, so that we can correctly
17199 track the condition codes. */
17200 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17201 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17202 }
17203 else
17204 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17205
17206 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17207 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17208 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17209
17210 /* Emit any final barrier needed for a __sync operation. */
17211 if (is_sync)
17212 aarch64_emit_post_barrier (model);
17213 }
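/* For example, splitting an SImode atomic fetch-and-add gives a loop of
   roughly this shape (registers illustrative):

     .retry:
       ldxr  w_old, [x_mem]
       add   w_new, w_old, w_value
       stxr  w_tmp, w_new, [x_mem]
       cbnz  w_tmp, .retry

   with acquire/release forms of the exclusives, and possibly a trailing
   barrier, chosen according to MODEL_RTX.  */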
17214
17215 static void
17216 aarch64_init_libfuncs (void)
17217 {
17218 /* Half-precision float operations. The compiler handles all operations
17219 with NULL libfuncs by converting to SFmode. */
17220
17221 /* Conversions. */
17222 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17223 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17224
17225 /* Arithmetic. */
17226 set_optab_libfunc (add_optab, HFmode, NULL);
17227 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17228 set_optab_libfunc (smul_optab, HFmode, NULL);
17229 set_optab_libfunc (neg_optab, HFmode, NULL);
17230 set_optab_libfunc (sub_optab, HFmode, NULL);
17231
17232 /* Comparisons. */
17233 set_optab_libfunc (eq_optab, HFmode, NULL);
17234 set_optab_libfunc (ne_optab, HFmode, NULL);
17235 set_optab_libfunc (lt_optab, HFmode, NULL);
17236 set_optab_libfunc (le_optab, HFmode, NULL);
17237 set_optab_libfunc (ge_optab, HFmode, NULL);
17238 set_optab_libfunc (gt_optab, HFmode, NULL);
17239 set_optab_libfunc (unord_optab, HFmode, NULL);
17240 }
17241
17242 /* Target hook for c_mode_for_suffix. */
17243 static machine_mode
17244 aarch64_c_mode_for_suffix (char suffix)
17245 {
17246 if (suffix == 'q')
17247 return TFmode;
17248
17249 return VOIDmode;
17250 }
17251
17252 /* We can only represent floating point constants which will fit in
17253 "quarter-precision" values. These values are characterised by
17254 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
17255 by:
17256
17257 (-1)^s * (n/16) * 2^r
17258
17259 Where:
17260 's' is the sign bit.
17261 'n' is an integer in the range 16 <= n <= 31.
17262 'r' is an integer in the range -3 <= r <= 4. */
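/* For example, 1.25 = (+1) * (20/16) * 2^0 and 3.0 = (+1) * (24/16) * 2^1
   are representable, whereas 0.0 and 0.1 are not (0.1 has no finite
   binary mantissa at all).  */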
17263
17264 /* Return true iff X can be represented as a quarter-precision
17265 floating point immediate operand. Note, we cannot represent 0.0. */
17266 bool
17267 aarch64_float_const_representable_p (rtx x)
17268 {
17269 /* This represents our current view of how many bits
17270 make up the mantissa. */
17271 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17272 int exponent;
17273 unsigned HOST_WIDE_INT mantissa, mask;
17274 REAL_VALUE_TYPE r, m;
17275 bool fail;
17276
17277 x = unwrap_const_vec_duplicate (x);
17278 if (!CONST_DOUBLE_P (x))
17279 return false;
17280
17281 if (GET_MODE (x) == VOIDmode
17282 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17283 return false;
17284
17285 r = *CONST_DOUBLE_REAL_VALUE (x);
17286
17287 /* We cannot represent infinities, NaNs or +/-zero. We won't
17288 know if we have +zero until we analyse the mantissa, but we
17289 can reject the other invalid values. */
17290 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17291 || REAL_VALUE_MINUS_ZERO (r))
17292 return false;
17293
17294 /* Extract exponent. */
17295 r = real_value_abs (&r);
17296 exponent = REAL_EXP (&r);
17297
17298 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17299 highest (sign) bit, with a fixed binary point at bit point_pos.
17300 The low element of w holds the low part of the mantissa, the high element the high part.
17301 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17302 bits for the mantissa, this can fail (low bits will be lost). */
17303 real_ldexp (&m, &r, point_pos - exponent);
17304 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17305
17306 /* If the low part of the mantissa has bits set we cannot represent
17307 the value. */
17308 if (w.ulow () != 0)
17309 return false;
17310 /* We have rejected the lower HOST_WIDE_INT, so update our
17311 understanding of how many bits lie in the mantissa and
17312 look only at the high HOST_WIDE_INT. */
17313 mantissa = w.elt (1);
17314 point_pos -= HOST_BITS_PER_WIDE_INT;
17315
17316 /* We can only represent values with a mantissa of the form 1.xxxx. */
17317 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17318 if ((mantissa & mask) != 0)
17319 return false;
17320
17321 /* Having filtered unrepresentable values, we may now remove all
17322 but the highest 5 bits. */
17323 mantissa >>= point_pos - 5;
17324
17325 /* We cannot represent the value 0.0, so reject it. This is handled
17326 elsewhere. */
17327 if (mantissa == 0)
17328 return false;
17329
17330 /* Then, as bit 4 is always set, we can mask it off, leaving
17331 the mantissa in the range [0, 15]. */
17332 mantissa &= ~(1 << 4);
17333 gcc_assert (mantissa <= 15);
17334
17335 /* GCC internally does not use IEEE754-like encoding (where normalized
17336 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17337 Our mantissa values are shifted 4 places to the left relative to
17338 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17339 by 5 places to correct for GCC's representation. */
17340 exponent = 5 - exponent;
17341
17342 return (exponent >= 0 && exponent <= 7);
17343 }
17344
17345 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17346 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17347 output MOVI/MVNI, ORR or BIC immediate. */
17348 char*
17349 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17350 enum simd_immediate_check which)
17351 {
17352 bool is_valid;
17353 static char templ[40];
17354 const char *mnemonic;
17355 const char *shift_op;
17356 unsigned int lane_count = 0;
17357 char element_char;
17358
17359 struct simd_immediate_info info;
17360
17361 /* This will return true to show const_vector is legal for use as either
17362 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17363 It will also update INFO to show how the immediate should be generated.
17364 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17365 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17366 gcc_assert (is_valid);
17367
17368 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17369 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17370
17371 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17372 {
17373 gcc_assert (info.insn == simd_immediate_info::MOV
17374 && info.u.mov.shift == 0);
17375 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17376 move immediate path. */
17377 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17378 info.u.mov.value = GEN_INT (0);
17379 else
17380 {
17381 const unsigned int buf_size = 20;
17382 char float_buf[buf_size] = {'\0'};
17383 real_to_decimal_for_mode (float_buf,
17384 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17385 buf_size, buf_size, 1, info.elt_mode);
17386
17387 if (lane_count == 1)
17388 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17389 else
17390 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17391 lane_count, element_char, float_buf);
17392 return templ;
17393 }
17394 }
17395
17396 gcc_assert (CONST_INT_P (info.u.mov.value));
17397
17398 if (which == AARCH64_CHECK_MOV)
17399 {
17400 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17401 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17402 ? "msl" : "lsl");
17403 if (lane_count == 1)
17404 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17405 mnemonic, UINTVAL (info.u.mov.value));
17406 else if (info.u.mov.shift)
17407 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17408 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17409 element_char, UINTVAL (info.u.mov.value), shift_op,
17410 info.u.mov.shift);
17411 else
17412 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17413 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17414 element_char, UINTVAL (info.u.mov.value));
17415 }
17416 else
17417 {
17418 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17419 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17420 if (info.u.mov.shift)
17421 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17422 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17423 element_char, UINTVAL (info.u.mov.value), "lsl",
17424 info.u.mov.shift);
17425 else
17426 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17427 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17428 element_char, UINTVAL (info.u.mov.value));
17429 }
17430 return templ;
17431 }
17432
17433 char*
17434 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17435 {
17436
17437 /* If a floating point number was passed and we want to use it in an
17438 integer mode, convert it to an integer. */
17439 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17440 {
17441 unsigned HOST_WIDE_INT ival;
17442 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17443 gcc_unreachable ();
17444 immediate = gen_int_mode (ival, mode);
17445 }
17446
17447 machine_mode vmode;
17448 /* Use a 64-bit container mode for everything except DImode/DFmode,
17449 where we use a 128-bit vector mode. */
17450 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17451
17452 vmode = aarch64_simd_container_mode (mode, width);
17453 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17454 return aarch64_output_simd_mov_immediate (v_op, width);
17455 }
17456
17457 /* Return the output string to use for moving immediate CONST_VECTOR
17458 into an SVE register. */
17459
17460 char *
17461 aarch64_output_sve_mov_immediate (rtx const_vector)
17462 {
17463 static char templ[40];
17464 struct simd_immediate_info info;
17465 char element_char;
17466
17467 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17468 gcc_assert (is_valid);
17469
17470 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17471
17472 machine_mode vec_mode = GET_MODE (const_vector);
17473 if (aarch64_sve_pred_mode_p (vec_mode))
17474 {
17475 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17476 if (info.insn == simd_immediate_info::MOV)
17477 {
17478 gcc_assert (info.u.mov.value == const0_rtx);
17479 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17480 }
17481 else
17482 {
17483 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17484 unsigned int total_bytes;
17485 if (info.u.pattern == AARCH64_SV_ALL
17486 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17487 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17488 total_bytes / GET_MODE_SIZE (info.elt_mode));
17489 else
17490 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17491 svpattern_token (info.u.pattern));
17492 }
17493 return buf;
17494 }
17495
17496 if (info.insn == simd_immediate_info::INDEX)
17497 {
17498 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17499 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17500 element_char, INTVAL (info.u.index.base),
17501 INTVAL (info.u.index.step));
17502 return templ;
17503 }
17504
17505 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17506 {
17507 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17508 info.u.mov.value = GEN_INT (0);
17509 else
17510 {
17511 const int buf_size = 20;
17512 char float_buf[buf_size] = {};
17513 real_to_decimal_for_mode (float_buf,
17514 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17515 buf_size, buf_size, 1, info.elt_mode);
17516
17517 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17518 element_char, float_buf);
17519 return templ;
17520 }
17521 }
17522
17523 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17524 element_char, INTVAL (info.u.mov.value));
17525 return templ;
17526 }
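/* Typical outputs (illustrative): an all-zero predicate yields
   "pfalse\t%0.b"; an all-true predicate of 32-bit elements on a
   128-bit vector yields "ptrue\t%0.s, vl4"; the series { 0, 1, 2, ... }
   yields "index\t%0.s, #0, #1"; and a splat of 5 yields
   "mov\t%0.s, #5".  */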
17527
17528 /* Split operands into moves from op[1] + op[2] into op[0]. */
17529
17530 void
17531 aarch64_split_combinev16qi (rtx operands[3])
17532 {
17533 unsigned int dest = REGNO (operands[0]);
17534 unsigned int src1 = REGNO (operands[1]);
17535 unsigned int src2 = REGNO (operands[2]);
17536 machine_mode halfmode = GET_MODE (operands[1]);
17537 unsigned int halfregs = REG_NREGS (operands[1]);
17538 rtx destlo, desthi;
17539
17540 gcc_assert (halfmode == V16QImode);
17541
17542 if (src1 == dest && src2 == dest + halfregs)
17543 {
17544 /* No-op move. Can't split to nothing; emit something. */
17545 emit_note (NOTE_INSN_DELETED);
17546 return;
17547 }
17548
17549 /* Preserve register attributes for variable tracking. */
17550 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17551 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17552 GET_MODE_SIZE (halfmode));
17553
17554 /* Special case of reversed high/low parts. */
17555 if (reg_overlap_mentioned_p (operands[2], destlo)
17556 && reg_overlap_mentioned_p (operands[1], desthi))
17557 {
17558 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17559 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17560 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17561 }
17562 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17563 {
17564 /* Try to avoid unnecessary moves if part of the result
17565 is in the right place already. */
17566 if (src1 != dest)
17567 emit_move_insn (destlo, operands[1]);
17568 if (src2 != dest + halfregs)
17569 emit_move_insn (desthi, operands[2]);
17570 }
17571 else
17572 {
17573 if (src2 != dest + halfregs)
17574 emit_move_insn (desthi, operands[2]);
17575 if (src1 != dest)
17576 emit_move_insn (destlo, operands[1]);
17577 }
17578 }
17579
17580 /* vec_perm support. */
17581
17582 struct expand_vec_perm_d
17583 {
17584 rtx target, op0, op1;
17585 vec_perm_indices perm;
17586 machine_mode vmode;
17587 unsigned int vec_flags;
17588 bool one_vector_p;
17589 bool testing_p;
17590 };
17591
17592 /* Generate a variable permutation. */
17593
17594 static void
17595 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17596 {
17597 machine_mode vmode = GET_MODE (target);
17598 bool one_vector_p = rtx_equal_p (op0, op1);
17599
17600 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17601 gcc_checking_assert (GET_MODE (op0) == vmode);
17602 gcc_checking_assert (GET_MODE (op1) == vmode);
17603 gcc_checking_assert (GET_MODE (sel) == vmode);
17604 gcc_checking_assert (TARGET_SIMD);
17605
17606 if (one_vector_p)
17607 {
17608 if (vmode == V8QImode)
17609 {
17610 /* Expand the argument to a V16QI mode by duplicating it. */
17611 rtx pair = gen_reg_rtx (V16QImode);
17612 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17613 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17614 }
17615 else
17616 {
17617 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17618 }
17619 }
17620 else
17621 {
17622 rtx pair;
17623
17624 if (vmode == V8QImode)
17625 {
17626 pair = gen_reg_rtx (V16QImode);
17627 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17628 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17629 }
17630 else
17631 {
17632 pair = gen_reg_rtx (OImode);
17633 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17634 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17635 }
17636 }
17637 }
17638
17639 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17640 NELT is the number of elements in the vector. */
17641
17642 void
17643 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17644 unsigned int nelt)
17645 {
17646 machine_mode vmode = GET_MODE (target);
17647 bool one_vector_p = rtx_equal_p (op0, op1);
17648 rtx mask;
17649
17650 /* The TBL instruction does not use a modulo index, so we must take care
17651 of that ourselves. */
17652 mask = aarch64_simd_gen_const_vector_dup (vmode,
17653 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17654 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
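/* As a worked example: with two V16QI inputs the mask is 31, so a
selector value of 37 becomes 37 & 31 == 5, i.e. element 5 of the
first input, matching the modulo semantics vec_perm expects
(37 mod 32 == 5).  */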
17655
17656 /* For big-endian, we also need to reverse the index within the vector
17657 (but not which vector). */
17658 if (BYTES_BIG_ENDIAN)
17659 {
17660 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17661 if (!one_vector_p)
17662 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17663 sel = expand_simple_binop (vmode, XOR, sel, mask,
17664 NULL, 0, OPTAB_LIB_WIDEN);
17665 }
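/* For instance, with two V16QI inputs (nelt == 16) on big-endian, an
index of 19 (element 3 of the second input) becomes 19 ^ 15 == 28
(element 12 of the second input): the position within the vector is
mirrored while the choice of vector is preserved.  */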
17666 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17667 }
17668
17669 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17670
17671 static void
17672 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17673 {
17674 emit_insn (gen_rtx_SET (target,
17675 gen_rtx_UNSPEC (GET_MODE (target),
17676 gen_rtvec (2, op0, op1), code)));
17677 }
17678
17679 /* Expand an SVE vec_perm with the given operands. */
17680
17681 void
17682 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17683 {
17684 machine_mode data_mode = GET_MODE (target);
17685 machine_mode sel_mode = GET_MODE (sel);
17686 /* Enforced by the pattern condition. */
17687 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17688
17689 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17690 size of the two value vectors, i.e. the upper bits of the indices
17691 are effectively ignored. SVE TBL instead produces 0 for any
17692 out-of-range indices, so we need to modulo all the vec_perm indices
17693 to ensure they are all in range. */
17694 rtx sel_reg = force_reg (sel_mode, sel);
17695
17696 /* Check if the sel only references the first values vector. */
17697 if (GET_CODE (sel) == CONST_VECTOR
17698 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17699 {
17700 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17701 return;
17702 }
17703
17704 /* Check if the two values vectors are the same. */
17705 if (rtx_equal_p (op0, op1))
17706 {
17707 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17708 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17709 NULL, 0, OPTAB_DIRECT);
17710 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17711 return;
17712 }
17713
17714 /* Run TBL on each value vector and combine the results. */
17715
17716 rtx res0 = gen_reg_rtx (data_mode);
17717 rtx res1 = gen_reg_rtx (data_mode);
17718 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17719 if (GET_CODE (sel) != CONST_VECTOR
17720 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17721 {
17722 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17723 2 * nunits - 1);
17724 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17725 NULL, 0, OPTAB_DIRECT);
17726 }
17727 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17728 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17729 NULL, 0, OPTAB_DIRECT);
17730 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17731 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17732 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17733 else
17734 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17735 }
17736
17737 /* Recognize patterns suitable for the TRN instructions. */
17738 static bool
17739 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17740 {
17741 HOST_WIDE_INT odd;
17742 poly_uint64 nelt = d->perm.length ();
17743 rtx out, in0, in1, x;
17744 machine_mode vmode = d->vmode;
17745
17746 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17747 return false;
17748
17749 /* Note that these are little-endian tests.
17750 We correct for big-endian later. */
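/* As a concrete example, for V4SI (nelt == 4) the selectors accepted
here are {0, 4, 2, 6} (odd == 0, TRN1) and {1, 5, 3, 7}
(odd == 1, TRN2).  */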
17751 if (!d->perm[0].is_constant (&odd)
17752 || (odd != 0 && odd != 1)
17753 || !d->perm.series_p (0, 2, odd, 2)
17754 || !d->perm.series_p (1, 2, nelt + odd, 2))
17755 return false;
17756
17757 /* Success! */
17758 if (d->testing_p)
17759 return true;
17760
17761 in0 = d->op0;
17762 in1 = d->op1;
17763 /* We don't need a big-endian lane correction for SVE; see the comment
17764 at the head of aarch64-sve.md for details. */
17765 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17766 {
17767 x = in0, in0 = in1, in1 = x;
17768 odd = !odd;
17769 }
17770 out = d->target;
17771
17772 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17773 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17774 return true;
17775 }
17776
17777 /* Recognize patterns suitable for the UZP instructions. */
17778 static bool
17779 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17780 {
17781 HOST_WIDE_INT odd;
17782 rtx out, in0, in1, x;
17783 machine_mode vmode = d->vmode;
17784
17785 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17786 return false;
17787
17788 /* Note that these are little-endian tests.
17789 We correct for big-endian later. */
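/* As a concrete example, for V4SI (nelt == 4) the selectors accepted
here are {0, 2, 4, 6} (odd == 0, UZP1) and {1, 3, 5, 7}
(odd == 1, UZP2).  */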
17790 if (!d->perm[0].is_constant (&odd)
17791 || (odd != 0 && odd != 1)
17792 || !d->perm.series_p (0, 1, odd, 2))
17793 return false;
17794
17795 /* Success! */
17796 if (d->testing_p)
17797 return true;
17798
17799 in0 = d->op0;
17800 in1 = d->op1;
17801 /* We don't need a big-endian lane correction for SVE; see the comment
17802 at the head of aarch64-sve.md for details. */
17803 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17804 {
17805 x = in0, in0 = in1, in1 = x;
17806 odd = !odd;
17807 }
17808 out = d->target;
17809
17810 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17811 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17812 return true;
17813 }
17814
17815 /* Recognize patterns suitable for the ZIP instructions. */
17816 static bool
17817 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17818 {
17819 unsigned int high;
17820 poly_uint64 nelt = d->perm.length ();
17821 rtx out, in0, in1, x;
17822 machine_mode vmode = d->vmode;
17823
17824 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17825 return false;
17826
17827 /* Note that these are little-endian tests.
17828 We correct for big-endian later. */
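/* As a concrete example, for V4SI (nelt == 4) the selectors accepted
here are {0, 4, 1, 5} (first == 0, ZIP1) and {2, 6, 3, 7}
(first == nelt / 2, ZIP2).  */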
17829 poly_uint64 first = d->perm[0];
17830 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17831 || !d->perm.series_p (0, 2, first, 1)
17832 || !d->perm.series_p (1, 2, first + nelt, 1))
17833 return false;
17834 high = maybe_ne (first, 0U);
17835
17836 /* Success! */
17837 if (d->testing_p)
17838 return true;
17839
17840 in0 = d->op0;
17841 in1 = d->op1;
17842 /* We don't need a big-endian lane correction for SVE; see the comment
17843 at the head of aarch64-sve.md for details. */
17844 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17845 {
17846 x = in0, in0 = in1, in1 = x;
17847 high = !high;
17848 }
17849 out = d->target;
17850
17851 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17852 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17853 return true;
17854 }
17855
17856 /* Recognize patterns for the EXT insn. */
17857
17858 static bool
17859 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17860 {
17861 HOST_WIDE_INT location;
17862 rtx offset;
17863
17864 /* The first element always refers to the first vector.
17865 Check if the extracted indices are increasing by one. */
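/* For example, with V4SI a selector of {1, 2, 3, 4} takes the last
three elements of the first input followed by the first element of
the second input, i.e. an EXT with an element offset of 1 (before
the big-endian adjustment below).  */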
17866 if (d->vec_flags == VEC_SVE_PRED
17867 || !d->perm[0].is_constant (&location)
17868 || !d->perm.series_p (0, 1, location, 1))
17869 return false;
17870
17871 /* Success! */
17872 if (d->testing_p)
17873 return true;
17874
17875 /* The case where (location == 0) is a no-op for both big- and little-endian,
17876 and is removed by the mid-end at optimization levels -O1 and higher.
17877
17878 We don't need a big-endian lane correction for SVE; see the comment
17879 at the head of aarch64-sve.md for details. */
17880 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17881 {
17882 /* After setup, we want the high elements of the first vector (stored
17883 at the LSB end of the register), and the low elements of the second
17884 vector (stored at the MSB end of the register). So swap. */
17885 std::swap (d->op0, d->op1);
17886 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17887 to_constant () is safe since this is restricted to Advanced SIMD
17888 vectors. */
17889 location = d->perm.length ().to_constant () - location;
17890 }
17891
17892 offset = GEN_INT (location);
17893 emit_set_insn (d->target,
17894 gen_rtx_UNSPEC (d->vmode,
17895 gen_rtvec (3, d->op0, d->op1, offset),
17896 UNSPEC_EXT));
17897 return true;
17898 }
17899
17900 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17901 within each 64-bit, 32-bit or 16-bit granule. */
17902
17903 static bool
17904 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17905 {
17906 HOST_WIDE_INT diff;
17907 unsigned int i, size, unspec;
17908 machine_mode pred_mode;
17909
17910 if (d->vec_flags == VEC_SVE_PRED
17911 || !d->one_vector_p
17912 || !d->perm[0].is_constant (&diff))
17913 return false;
17914
17915 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17916 if (size == 8)
17917 {
17918 unspec = UNSPEC_REV64;
17919 pred_mode = VNx2BImode;
17920 }
17921 else if (size == 4)
17922 {
17923 unspec = UNSPEC_REV32;
17924 pred_mode = VNx4BImode;
17925 }
17926 else if (size == 2)
17927 {
17928 unspec = UNSPEC_REV16;
17929 pred_mode = VNx8BImode;
17930 }
17931 else
17932 return false;
17933
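/* As a worked example, the V8HI selector {3, 2, 1, 0, 7, 6, 5, 4}
gives diff == 3 and hence size == 8 above (REV64); the series check
below then confirms that every 4-element block is reversed.  */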
17934 unsigned int step = diff + 1;
17935 for (i = 0; i < step; ++i)
17936 if (!d->perm.series_p (i, step, diff - i, step))
17937 return false;
17938
17939 /* Success! */
17940 if (d->testing_p)
17941 return true;
17942
17943 if (d->vec_flags == VEC_SVE_DATA)
17944 {
17945 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17946 rtx target = gen_reg_rtx (int_mode);
17947 if (BYTES_BIG_ENDIAN)
17948 /* The act of taking a subreg between INT_MODE and d->vmode
17949 is itself a reversing operation on big-endian targets;
17950 see the comment at the head of aarch64-sve.md for details.
17951 First reinterpret OP0 as INT_MODE without using a subreg
17952 and without changing the contents. */
17953 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17954 else
17955 {
17956 /* For SVE we use REV[BHW] unspecs derived from the element size
17957 of d->vmode and vector modes whose elements have SIZE bytes.
17958 This ensures that the vector modes match the predicate modes. */
17959 int unspec = aarch64_sve_rev_unspec (d->vmode);
17960 rtx pred = aarch64_ptrue_reg (pred_mode);
17961 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17962 gen_lowpart (int_mode, d->op0)));
17963 }
17964 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17965 return true;
17966 }
17967 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17968 emit_set_insn (d->target, src);
17969 return true;
17970 }
17971
17972 /* Recognize patterns for the REV insn, which reverses elements within
17973 a full vector. */
17974
17975 static bool
17976 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17977 {
17978 poly_uint64 nelt = d->perm.length ();
17979
17980 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17981 return false;
17982
17983 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17984 return false;
17985
17986 /* Success! */
17987 if (d->testing_p)
17988 return true;
17989
17990 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17991 emit_set_insn (d->target, src);
17992 return true;
17993 }
17994
17995 static bool
17996 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17997 {
17998 rtx out = d->target;
17999 rtx in0;
18000 HOST_WIDE_INT elt;
18001 machine_mode vmode = d->vmode;
18002 rtx lane;
18003
18004 if (d->vec_flags == VEC_SVE_PRED
18005 || d->perm.encoding ().encoded_nelts () != 1
18006 || !d->perm[0].is_constant (&elt))
18007 return false;
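/* At this point the selector is known to be a broadcast of a single
constant element, e.g. {2, 2, 2, 2} for V4SI, which on Advanced SIMD
corresponds to DUP Vd.4S, Vn.S[2].  */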
18008
18009 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
18010 return false;
18011
18012 /* Success! */
18013 if (d->testing_p)
18014 return true;
18015
18016 /* The generic preparation in aarch64_expand_vec_perm_const_1
18017 swaps the operand order and the permute indices if it finds
18018 d->perm[0] to be in the second operand. Thus, we can always
18019 use d->op0 and need not do any extra arithmetic to get the
18020 correct lane number. */
18021 in0 = d->op0;
18022 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
18023
18024 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
18025 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
18026 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
18027 return true;
18028 }
18029
18030 static bool
18031 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
18032 {
18033 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
18034 machine_mode vmode = d->vmode;
18035
18036 /* Make sure that the indices are constant. */
18037 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
18038 for (unsigned int i = 0; i < encoded_nelts; ++i)
18039 if (!d->perm[i].is_constant ())
18040 return false;
18041
18042 if (d->testing_p)
18043 return true;
18044
18045 /* Generic code will try constant permutation twice: once with the
18046 original mode and again with the elements lowered to QImode.
18047 So wait and don't do the selector expansion ourselves. */
18048 if (vmode != V8QImode && vmode != V16QImode)
18049 return false;
18050
18051 /* to_constant is safe since this routine is specific to Advanced SIMD
18052 vectors. */
18053 unsigned int nelt = d->perm.length ().to_constant ();
18054 for (unsigned int i = 0; i < nelt; ++i)
18055 /* If big-endian and two vectors we end up with a weird mixed-endian
18056 mode on NEON. Reverse the index within each word but not the word
18057 itself. to_constant is safe because we checked is_constant above. */
18058 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
18059 ? d->perm[i].to_constant () ^ (nelt - 1)
18060 : d->perm[i].to_constant ());
18061
18062 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18063 sel = force_reg (vmode, sel);
18064
18065 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
18066 return true;
18067 }
18068
18069 /* Try to implement D using an SVE TBL instruction. */
18070
18071 static bool
18072 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
18073 {
18074 unsigned HOST_WIDE_INT nelt;
18075
18076 /* Permuting two variable-length vectors could overflow the
18077 index range. */
18078 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
18079 return false;
18080
18081 if (d->testing_p)
18082 return true;
18083
18084 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
18085 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
18086 if (d->one_vector_p)
18087 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
18088 else
18089 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
18090 return true;
18091 }
18092
18093 /* Try to implement D using SVE SEL instruction. */
18094
18095 static bool
18096 aarch64_evpc_sel (struct expand_vec_perm_d *d)
18097 {
18098 machine_mode vmode = d->vmode;
18099 int unit_size = GET_MODE_UNIT_SIZE (vmode);
18100
18101 if (d->vec_flags != VEC_SVE_DATA
18102 || unit_size > 8)
18103 return false;
18104
18105 int n_patterns = d->perm.encoding ().npatterns ();
18106 poly_int64 vec_len = d->perm.length ();
18107
18108 for (int i = 0; i < n_patterns; ++i)
18109 if (!known_eq (d->perm[i], i)
18110 && !known_eq (d->perm[i], vec_len + i))
18111 return false;
18112
18113 for (int i = n_patterns; i < n_patterns * 2; i++)
18114 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
18115 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
18116 return false;
18117
18118 if (d->testing_p)
18119 return true;
18120
18121 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
18122
18123 rtx_vector_builder builder (pred_mode, n_patterns, 2);
18124 for (int i = 0; i < n_patterns * 2; i++)
18125 {
18126 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
18127 : CONST0_RTX (BImode);
18128 builder.quick_push (elem);
18129 }
18130
18131 rtx const_vec = builder.build ();
18132 rtx pred = force_reg (pred_mode, const_vec);
18133 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
18134 return true;
18135 }
18136
18137 static bool
18138 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18139 {
18140 /* The pattern matching functions above are written to look for a small
18141 number to begin the sequence (0, 1, N/2). If we begin with an index
18142 from the second operand, we can swap the operands. */
18143 poly_int64 nelt = d->perm.length ();
18144 if (known_ge (d->perm[0], nelt))
18145 {
18146 d->perm.rotate_inputs (1);
18147 std::swap (d->op0, d->op1);
18148 }
18149
18150 if ((d->vec_flags == VEC_ADVSIMD
18151 || d->vec_flags == VEC_SVE_DATA
18152 || d->vec_flags == VEC_SVE_PRED)
18153 && known_gt (nelt, 1))
18154 {
18155 if (aarch64_evpc_rev_local (d))
18156 return true;
18157 else if (aarch64_evpc_rev_global (d))
18158 return true;
18159 else if (aarch64_evpc_ext (d))
18160 return true;
18161 else if (aarch64_evpc_dup (d))
18162 return true;
18163 else if (aarch64_evpc_zip (d))
18164 return true;
18165 else if (aarch64_evpc_uzp (d))
18166 return true;
18167 else if (aarch64_evpc_trn (d))
18168 return true;
18169 else if (aarch64_evpc_sel (d))
18170 return true;
18171 if (d->vec_flags == VEC_SVE_DATA)
18172 return aarch64_evpc_sve_tbl (d);
18173 else if (d->vec_flags == VEC_ADVSIMD)
18174 return aarch64_evpc_tbl (d);
18175 }
18176 return false;
18177 }
18178
18179 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18180
18181 static bool
18182 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18183 rtx op1, const vec_perm_indices &sel)
18184 {
18185 struct expand_vec_perm_d d;
18186
18187 /* Check whether the mask can be applied to a single vector. */
18188 if (sel.ninputs () == 1
18189 || (op0 && rtx_equal_p (op0, op1)))
18190 d.one_vector_p = true;
18191 else if (sel.all_from_input_p (0))
18192 {
18193 d.one_vector_p = true;
18194 op1 = op0;
18195 }
18196 else if (sel.all_from_input_p (1))
18197 {
18198 d.one_vector_p = true;
18199 op0 = op1;
18200 }
18201 else
18202 d.one_vector_p = false;
18203
18204 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18205 sel.nelts_per_input ());
18206 d.vmode = vmode;
18207 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18208 d.target = target;
18209 d.op0 = op0;
18210 d.op1 = op1;
18211 d.testing_p = !target;
18212
18213 if (!d.testing_p)
18214 return aarch64_expand_vec_perm_const_1 (&d);
18215
18216 rtx_insn *last = get_last_insn ();
18217 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18218 gcc_assert (last == get_last_insn ());
18219
18220 return ret;
18221 }
18222
18223 /* Generate a byte permute mask for a register of mode MODE,
18224 which has NUNITS units. */
18225
18226 rtx
18227 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18228 {
18229 /* We have to reverse each vector because we don't have
18230 a permuted load that can reverse-load according to ABI rules. */
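/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built
below is {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
i.e. the bytes within each element are reversed while the element
positions are unchanged.  */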
18231 rtx mask;
18232 rtvec v = rtvec_alloc (16);
18233 unsigned int i, j;
18234 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18235
18236 gcc_assert (BYTES_BIG_ENDIAN);
18237 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18238
18239 for (i = 0; i < nunits; i++)
18240 for (j = 0; j < usize; j++)
18241 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18242 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18243 return force_reg (V16QImode, mask);
18244 }
18245
18246 /* Expand an SVE integer comparison using the SVE equivalent of:
18247
18248 (set TARGET (CODE OP0 OP1)). */
18249
18250 void
18251 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18252 {
18253 machine_mode pred_mode = GET_MODE (target);
18254 machine_mode data_mode = GET_MODE (op0);
18255 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18256 op0, op1);
18257 if (!rtx_equal_p (target, res))
18258 emit_move_insn (target, res);
18259 }
18260
18261 /* Return the UNSPEC_COND_* code for comparison CODE. */
18262
18263 static unsigned int
18264 aarch64_unspec_cond_code (rtx_code code)
18265 {
18266 switch (code)
18267 {
18268 case NE:
18269 return UNSPEC_COND_FCMNE;
18270 case EQ:
18271 return UNSPEC_COND_FCMEQ;
18272 case LT:
18273 return UNSPEC_COND_FCMLT;
18274 case GT:
18275 return UNSPEC_COND_FCMGT;
18276 case LE:
18277 return UNSPEC_COND_FCMLE;
18278 case GE:
18279 return UNSPEC_COND_FCMGE;
18280 case UNORDERED:
18281 return UNSPEC_COND_FCMUO;
18282 default:
18283 gcc_unreachable ();
18284 }
18285 }
18286
18287 /* Emit:
18288
18289 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18290
18291 where <X> is the operation associated with comparison CODE.
18292 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18293
18294 static void
18295 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18296 bool known_ptrue_p, rtx op0, rtx op1)
18297 {
18298 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18299 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18300 gen_rtvec (4, pred, flag, op0, op1),
18301 aarch64_unspec_cond_code (code));
18302 emit_set_insn (target, unspec);
18303 }
18304
18305 /* Emit the SVE equivalent of:
18306
18307 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18308 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18309 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18310
18311 where <Xi> is the operation associated with comparison CODEi.
18312 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18313
18314 static void
18315 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18316 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18317 {
18318 machine_mode pred_mode = GET_MODE (pred);
18319 rtx tmp1 = gen_reg_rtx (pred_mode);
18320 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18321 rtx tmp2 = gen_reg_rtx (pred_mode);
18322 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18323 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18324 }
18325
18326 /* Emit the SVE equivalent of:
18327
18328 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18329 (set TARGET (not TMP))
18330
18331 where <X> is the operation associated with comparison CODE.
18332 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18333
18334 static void
18335 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18336 bool known_ptrue_p, rtx op0, rtx op1)
18337 {
18338 machine_mode pred_mode = GET_MODE (pred);
18339 rtx tmp = gen_reg_rtx (pred_mode);
18340 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18341 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18342 }
18343
18344 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18345
18346 (set TARGET (CODE OP0 OP1))
18347
18348 If CAN_INVERT_P is true, the caller can also handle inverted results;
18349 return true if the result is in fact inverted. */
18350
18351 bool
18352 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18353 rtx op0, rtx op1, bool can_invert_p)
18354 {
18355 machine_mode pred_mode = GET_MODE (target);
18356 machine_mode data_mode = GET_MODE (op0);
18357
18358 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18359 switch (code)
18360 {
18361 case UNORDERED:
18362 /* UNORDERED has no immediate form. */
18363 op1 = force_reg (data_mode, op1);
18364 /* fall through */
18365 case LT:
18366 case LE:
18367 case GT:
18368 case GE:
18369 case EQ:
18370 case NE:
18371 {
18372 /* There is native support for the comparison. */
18373 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18374 return false;
18375 }
18376
18377 case LTGT:
18378 /* This is a trapping operation (LT or GT). */
18379 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18380 return false;
18381
18382 case UNEQ:
18383 if (!flag_trapping_math)
18384 {
18385 /* This would trap for signaling NaNs. */
18386 op1 = force_reg (data_mode, op1);
18387 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18388 ptrue, true, op0, op1);
18389 return false;
18390 }
18391 /* fall through */
18392 case UNLT:
18393 case UNLE:
18394 case UNGT:
18395 case UNGE:
18396 if (flag_trapping_math)
18397 {
18398 /* Work out which elements are ordered. */
18399 rtx ordered = gen_reg_rtx (pred_mode);
18400 op1 = force_reg (data_mode, op1);
18401 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18402 ptrue, true, op0, op1);
18403
18404 /* Test the opposite condition for the ordered elements,
18405 then invert the result. */
18406 if (code == UNEQ)
18407 code = NE;
18408 else
18409 code = reverse_condition_maybe_unordered (code);
18410 if (can_invert_p)
18411 {
18412 aarch64_emit_sve_fp_cond (target, code,
18413 ordered, false, op0, op1);
18414 return true;
18415 }
18416 aarch64_emit_sve_invert_fp_cond (target, code,
18417 ordered, false, op0, op1);
18418 return false;
18419 }
18420 break;
18421
18422 case ORDERED:
18423 /* ORDERED has no immediate form. */
18424 op1 = force_reg (data_mode, op1);
18425 break;
18426
18427 default:
18428 gcc_unreachable ();
18429 }
18430
18431 /* There is native support for the inverse comparison. */
18432 code = reverse_condition_maybe_unordered (code);
18433 if (can_invert_p)
18434 {
18435 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18436 return true;
18437 }
18438 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18439 return false;
18440 }
18441
18442 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18443 of the data being selected and CMP_MODE is the mode of the values being
18444 compared. */
18445
18446 void
18447 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18448 rtx *ops)
18449 {
18450 machine_mode pred_mode
18451 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18452 GET_MODE_SIZE (cmp_mode)).require ();
18453 rtx pred = gen_reg_rtx (pred_mode);
18454 if (FLOAT_MODE_P (cmp_mode))
18455 {
18456 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18457 ops[4], ops[5], true))
18458 std::swap (ops[1], ops[2]);
18459 }
18460 else
18461 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18462
18463 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18464 ops[1] = force_reg (data_mode, ops[1]);
18465 /* The "false" value can only be zero if the "true" value is a constant. */
18466 if (register_operand (ops[1], data_mode)
18467 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18468 ops[2] = force_reg (data_mode, ops[2]);
18469
18470 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18471 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18472 }
18473
18474 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18475 true. However, due to issues with register allocation it is preferable
18476 to avoid tying integer scalar and FP scalar modes. Executing integer
18477 operations in general registers is better than treating them as scalar
18478 vector operations. This reduces latency and avoids redundant int<->FP
18479 moves. So tie modes if they are either the same class, or vector modes
18480 with other vector modes, vector structs or any scalar mode. */
18481
18482 static bool
18483 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18484 {
18485 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18486 return true;
18487
18488 /* We specifically want to allow elements of "structure" modes to
18489 be tieable to the structure. This more general condition allows
18490 other rarer situations too. The reason we don't extend this to
18491 predicate modes is that there are no predicate structure modes
18492 nor any specific instructions for extracting part of a predicate
18493 register. */
18494 if (aarch64_vector_data_mode_p (mode1)
18495 && aarch64_vector_data_mode_p (mode2))
18496 return true;
18497
18498 /* Also allow any scalar modes with vectors. */
18499 if (aarch64_vector_mode_supported_p (mode1)
18500 || aarch64_vector_mode_supported_p (mode2))
18501 return true;
18502
18503 return false;
18504 }
18505
18506 /* Return a new RTX holding the result of moving POINTER forward by
18507 AMOUNT bytes. */
18508
18509 static rtx
18510 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18511 {
18512 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18513
18514 return adjust_automodify_address (pointer, GET_MODE (pointer),
18515 next, amount);
18516 }
18517
18518 /* Return a new RTX holding the result of moving POINTER forward by the
18519 size of the mode it points to. */
18520
18521 static rtx
18522 aarch64_progress_pointer (rtx pointer)
18523 {
18524 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18525 }
18526
18527 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18528 MODE bytes. */
18529
18530 static void
18531 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18532 machine_mode mode)
18533 {
18534 rtx reg = gen_reg_rtx (mode);
18535
18536 /* "Cast" the pointers to the correct mode. */
18537 *src = adjust_address (*src, mode, 0);
18538 *dst = adjust_address (*dst, mode, 0);
18539 /* Emit the memcpy. */
18540 emit_move_insn (reg, *src);
18541 emit_move_insn (*dst, reg);
18542 /* Move the pointers forward. */
18543 *src = aarch64_progress_pointer (*src);
18544 *dst = aarch64_progress_pointer (*dst);
18545 }
18546
18547 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18548 we succeed, otherwise return false. */
18549
18550 bool
18551 aarch64_expand_cpymem (rtx *operands)
18552 {
18553 int n, mode_bits;
18554 rtx dst = operands[0];
18555 rtx src = operands[1];
18556 rtx base;
18557 machine_mode cur_mode = BLKmode, next_mode;
18558 bool speed_p = !optimize_function_for_size_p (cfun);
18559
18560 /* When optimizing for size, give a better estimate of the length of a
18561 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18562 will always require an even number of instructions to do now. And each
18563 operation requires both a load and a store, so divide the max number by 2. */
18564 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18565
18566 /* We can't do anything smart if the amount to copy is not constant. */
18567 if (!CONST_INT_P (operands[2]))
18568 return false;
18569
18570 n = INTVAL (operands[2]);
18571
18572 /* Try to keep the number of instructions low. For all cases we will do at
18573 most two moves for the residual amount, since we'll always overlap the
18574 remainder. */
18575 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18576 return false;
18577
18578 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18579 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18580
18581 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18582 src = adjust_automodify_address (src, VOIDmode, base, 0);
18583
18584 /* Convert n to bits to make the rest of the code simpler. */
18585 n = n * BITS_PER_UNIT;
18586
18587 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18588 larger than TImode, but we should not use them for loads/stores here. */
18589 const int copy_limit = GET_MODE_BITSIZE (TImode);
18590
18591 while (n > 0)
18592 {
18593 /* Find the largest mode in which to do the copy without over-reading
18594 or over-writing. */
18595 opt_scalar_int_mode mode_iter;
18596 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18597 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18598 cur_mode = mode_iter.require ();
18599
18600 gcc_assert (cur_mode != BLKmode);
18601
18602 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18603 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18604
18605 n -= mode_bits;
18606
18607 /* Do certain trailing copies as overlapping if it's going to be
18608 cheaper, i.e. fewer instructions to do so. For instance, for a 15
18609 byte copy it's more efficient to do two overlapping 8 byte copies than
18610 8 + 6 + 1. */
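/* As a worked example, for n == 15 bytes the loop above first emits
one 8-byte (DImode) copy, leaving 7 bytes; the code below then moves
both pointers back by one byte and emits a second 8-byte copy, so
the 15 bytes are covered by two overlapping DImode moves.  */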
18611 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18612 {
18613 next_mode = smallest_mode_for_size (n, MODE_INT);
18614 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18615 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18616 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18617 n = n_bits;
18618 }
18619 }
18620
18621 return true;
18622 }
18623
18624 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18625 SImode stores. Handle the case when the constant has identical
18626 bottom and top halves. This is beneficial when the two stores can be
18627 merged into an STP and we avoid synthesising potentially expensive
18628 immediates twice. Return true if such a split is possible. */
18629
18630 bool
18631 aarch64_split_dimode_const_store (rtx dst, rtx src)
18632 {
18633 rtx lo = gen_lowpart (SImode, src);
18634 rtx hi = gen_highpart_mode (SImode, DImode, src);
18635
18636 bool size_p = optimize_function_for_size_p (cfun);
18637
18638 if (!rtx_equal_p (lo, hi))
18639 return false;
18640
18641 unsigned int orig_cost
18642 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18643 unsigned int lo_cost
18644 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18645
18646 /* We want to transform:
18647 MOV x1, 49370
18648 MOVK x1, 0x140, lsl 16
18649 MOVK x1, 0xc0da, lsl 32
18650 MOVK x1, 0x140, lsl 48
18651 STR x1, [x0]
18652 into:
18653 MOV w1, 49370
18654 MOVK w1, 0x140, lsl 16
18655 STP w1, w1, [x0]
18656 So we want to perform this only when we save two instructions
18657 or more. When optimizing for size, however, accept any code size
18658 savings we can. */
18659 if (size_p && orig_cost <= lo_cost)
18660 return false;
18661
18662 if (!size_p
18663 && (orig_cost <= lo_cost + 1))
18664 return false;
18665
18666 rtx mem_lo = adjust_address (dst, SImode, 0);
18667 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18668 return false;
18669
18670 rtx tmp_reg = gen_reg_rtx (SImode);
18671 aarch64_expand_mov_immediate (tmp_reg, lo);
18672 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18673 /* Don't emit an explicit store pair as this may not always be profitable.
18674 Let the sched-fusion logic decide whether to merge them. */
18675 emit_move_insn (mem_lo, tmp_reg);
18676 emit_move_insn (mem_hi, tmp_reg);
18677
18678 return true;
18679 }
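/* For illustration, a (hypothetical) store such as

void f (unsigned long long *p) { *p = 0x0140c0da0140c0daULL; }

has identical 32-bit halves, so the function above emits the two
SImode stores of a single W register and leaves it to sched-fusion
to turn them into an STP.  */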
18680
18681 /* Generate RTL for a conditional branch with rtx comparison CODE in
18682 mode CC_MODE. The destination of the unlikely conditional branch
18683 is LABEL_REF. */
18684
18685 void
18686 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18687 rtx label_ref)
18688 {
18689 rtx x;
18690 x = gen_rtx_fmt_ee (code, VOIDmode,
18691 gen_rtx_REG (cc_mode, CC_REGNUM),
18692 const0_rtx);
18693
18694 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18695 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18696 pc_rtx);
18697 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18698 }
18699
18700 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18701
18702 OP1 represents the TImode destination operand 1
18703 OP2 represents the TImode destination operand 2
18704 LOW_DEST represents the low half (DImode) of TImode operand 0
18705 LOW_IN1 represents the low half (DImode) of TImode operand 1
18706 LOW_IN2 represents the low half (DImode) of TImode operand 2
18707 HIGH_DEST represents the high half (DImode) of TImode operand 0
18708 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18709 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18710
18711 void
18712 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18713 rtx *low_in1, rtx *low_in2,
18714 rtx *high_dest, rtx *high_in1,
18715 rtx *high_in2)
18716 {
18717 *low_dest = gen_reg_rtx (DImode);
18718 *low_in1 = gen_lowpart (DImode, op1);
18719 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18720 subreg_lowpart_offset (DImode, TImode));
18721 *high_dest = gen_reg_rtx (DImode);
18722 *high_in1 = gen_highpart (DImode, op1);
18723 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18724 subreg_highpart_offset (DImode, TImode));
18725 }
18726
18727 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18728
18729 This function differs from 'aarch64_addti_scratch_regs' in that
18730 OP1 can be an immediate constant (zero). We must call
18731 subreg_highpart_offset with DImode and TImode arguments, otherwise
18732 VOIDmode will be used for the const_int which generates an internal
18733 error from subreg_size_highpart_offset which does not expect a size of zero.
18734
18735 OP1 represents the TImode destination operand 1
18736 OP2 represents the TImode destination operand 2
18737 LOW_DEST represents the low half (DImode) of TImode operand 0
18738 LOW_IN1 represents the low half (DImode) of TImode operand 1
18739 LOW_IN2 represents the low half (DImode) of TImode operand 2
18740 HIGH_DEST represents the high half (DImode) of TImode operand 0
18741 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18742 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18743
18744
18745 void
18746 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18747 rtx *low_in1, rtx *low_in2,
18748 rtx *high_dest, rtx *high_in1,
18749 rtx *high_in2)
18750 {
18751 *low_dest = gen_reg_rtx (DImode);
18752 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18753 subreg_lowpart_offset (DImode, TImode));
18754
18755 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18756 subreg_lowpart_offset (DImode, TImode));
18757 *high_dest = gen_reg_rtx (DImode);
18758
18759 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18760 subreg_highpart_offset (DImode, TImode));
18761 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18762 subreg_highpart_offset (DImode, TImode));
18763 }
18764
18765 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18766
18767 OP0 represents the TImode destination operand 0
18768 LOW_DEST represents the low half (DImode) of TImode operand 0
18769 LOW_IN1 represents the low half (DImode) of TImode operand 1
18770 LOW_IN2 represents the low half (DImode) of TImode operand 2
18771 HIGH_DEST represents the high half (DImode) of TImode operand 0
18772 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18773 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18774 UNSIGNED_P is true if the operation is being performed on unsigned
18775 values. */
18776 void
18777 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18778 rtx low_in2, rtx high_dest, rtx high_in1,
18779 rtx high_in2, bool unsigned_p)
18780 {
18781 if (low_in2 == const0_rtx)
18782 {
18783 low_dest = low_in1;
18784 high_in2 = force_reg (DImode, high_in2);
18785 if (unsigned_p)
18786 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18787 else
18788 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18789 }
18790 else
18791 {
18792 if (CONST_INT_P (low_in2))
18793 {
18794 high_in2 = force_reg (DImode, high_in2);
18795 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18796 GEN_INT (-INTVAL (low_in2))));
18797 }
18798 else
18799 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18800
18801 if (unsigned_p)
18802 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18803 else
18804 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18805 }
18806
18807 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18808 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18809
18810 }
18811
18812 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18813
18814 static unsigned HOST_WIDE_INT
18815 aarch64_asan_shadow_offset (void)
18816 {
18817 if (TARGET_ILP32)
18818 return (HOST_WIDE_INT_1 << 29);
18819 else
18820 return (HOST_WIDE_INT_1 << 36);
18821 }
18822
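/* The two hooks below (TARGET_GEN_CCMP_FIRST / TARGET_GEN_CCMP_NEXT)
let the middle-end chain comparisons with conditional compares.
As a rough illustration (exact flags and immediates are chosen by
the expanders), for

if (x == 3 && y == 5) ...

the chain has the shape

cmp w0, #3
ccmp w1, #5, #0, eq
b.eq ...

rather than two separate compare-and-branch sequences.  */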
18823 static rtx
18824 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18825 int code, tree treeop0, tree treeop1)
18826 {
18827 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18828 rtx op0, op1;
18829 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18830 insn_code icode;
18831 struct expand_operand ops[4];
18832
18833 start_sequence ();
18834 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18835
18836 op_mode = GET_MODE (op0);
18837 if (op_mode == VOIDmode)
18838 op_mode = GET_MODE (op1);
18839
18840 switch (op_mode)
18841 {
18842 case E_QImode:
18843 case E_HImode:
18844 case E_SImode:
18845 cmp_mode = SImode;
18846 icode = CODE_FOR_cmpsi;
18847 break;
18848
18849 case E_DImode:
18850 cmp_mode = DImode;
18851 icode = CODE_FOR_cmpdi;
18852 break;
18853
18854 case E_SFmode:
18855 cmp_mode = SFmode;
18856 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18857 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18858 break;
18859
18860 case E_DFmode:
18861 cmp_mode = DFmode;
18862 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18863 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18864 break;
18865
18866 default:
18867 end_sequence ();
18868 return NULL_RTX;
18869 }
18870
18871 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18872 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18873 if (!op0 || !op1)
18874 {
18875 end_sequence ();
18876 return NULL_RTX;
18877 }
18878 *prep_seq = get_insns ();
18879 end_sequence ();
18880
18881 create_fixed_operand (&ops[0], op0);
18882 create_fixed_operand (&ops[1], op1);
18883
18884 start_sequence ();
18885 if (!maybe_expand_insn (icode, 2, ops))
18886 {
18887 end_sequence ();
18888 return NULL_RTX;
18889 }
18890 *gen_seq = get_insns ();
18891 end_sequence ();
18892
18893 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18894 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18895 }
18896
18897 static rtx
18898 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18899 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18900 {
18901 rtx op0, op1, target;
18902 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18903 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18904 insn_code icode;
18905 struct expand_operand ops[6];
18906 int aarch64_cond;
18907
18908 push_to_sequence (*prep_seq);
18909 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18910
18911 op_mode = GET_MODE (op0);
18912 if (op_mode == VOIDmode)
18913 op_mode = GET_MODE (op1);
18914
18915 switch (op_mode)
18916 {
18917 case E_QImode:
18918 case E_HImode:
18919 case E_SImode:
18920 cmp_mode = SImode;
18921 icode = CODE_FOR_ccmpsi;
18922 break;
18923
18924 case E_DImode:
18925 cmp_mode = DImode;
18926 icode = CODE_FOR_ccmpdi;
18927 break;
18928
18929 case E_SFmode:
18930 cmp_mode = SFmode;
18931 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18932 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18933 break;
18934
18935 case E_DFmode:
18936 cmp_mode = DFmode;
18937 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18938 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18939 break;
18940
18941 default:
18942 end_sequence ();
18943 return NULL_RTX;
18944 }
18945
18946 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18947 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18948 if (!op0 || !op1)
18949 {
18950 end_sequence ();
18951 return NULL_RTX;
18952 }
18953 *prep_seq = get_insns ();
18954 end_sequence ();
18955
18956 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18957 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18958
18959 if (bit_code != AND)
18960 {
18961 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18962 GET_MODE (XEXP (prev, 0))),
18963 VOIDmode, XEXP (prev, 0), const0_rtx);
18964 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18965 }
18966
18967 create_fixed_operand (&ops[0], XEXP (prev, 0));
18968 create_fixed_operand (&ops[1], target);
18969 create_fixed_operand (&ops[2], op0);
18970 create_fixed_operand (&ops[3], op1);
18971 create_fixed_operand (&ops[4], prev);
18972 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18973
18974 push_to_sequence (*gen_seq);
18975 if (!maybe_expand_insn (icode, 6, ops))
18976 {
18977 end_sequence ();
18978 return NULL_RTX;
18979 }
18980
18981 *gen_seq = get_insns ();
18982 end_sequence ();
18983
18984 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18985 }
18986
18987 #undef TARGET_GEN_CCMP_FIRST
18988 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18989
18990 #undef TARGET_GEN_CCMP_NEXT
18991 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18992
18993 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18994 instruction fusion of some sort. */
18995
18996 static bool
18997 aarch64_macro_fusion_p (void)
18998 {
18999 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
19000 }
19001
19002
19003 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19004 should be kept together during scheduling. */
19005
19006 static bool
19007 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
19008 {
19009 rtx set_dest;
19010 rtx prev_set = single_set (prev);
19011 rtx curr_set = single_set (curr);
19012 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
19013 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
19014
19015 if (!aarch64_macro_fusion_p ())
19016 return false;
19017
19018 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
19019 {
19020 /* We are trying to match:
19021 prev (mov) == (set (reg r0) (const_int imm16))
19022 curr (movk) == (set (zero_extract (reg r0)
19023 (const_int 16)
19024 (const_int 16))
19025 (const_int imm16_1)) */
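/* In assembly terms this is a pair such as

mov x0, #0x1234
movk x0, #0x5678, lsl 16

which cores implementing this fusion can treat as a single
operation when the two instructions stay adjacent.  */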
19026
19027 set_dest = SET_DEST (curr_set);
19028
19029 if (GET_CODE (set_dest) == ZERO_EXTRACT
19030 && CONST_INT_P (SET_SRC (curr_set))
19031 && CONST_INT_P (SET_SRC (prev_set))
19032 && CONST_INT_P (XEXP (set_dest, 2))
19033 && INTVAL (XEXP (set_dest, 2)) == 16
19034 && REG_P (XEXP (set_dest, 0))
19035 && REG_P (SET_DEST (prev_set))
19036 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
19037 {
19038 return true;
19039 }
19040 }
19041
19042 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
19043 {
19044
19045 /* We're trying to match:
19046 prev (adrp) == (set (reg r1)
19047 (high (symbol_ref ("SYM"))))
19048 curr (add) == (set (reg r0)
19049 (lo_sum (reg r1)
19050 (symbol_ref ("SYM"))))
19051 Note that r0 need not necessarily be the same as r1, especially
19052 during pre-regalloc scheduling. */
19053
19054 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19055 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19056 {
19057 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
19058 && REG_P (XEXP (SET_SRC (curr_set), 0))
19059 && REGNO (XEXP (SET_SRC (curr_set), 0))
19060 == REGNO (SET_DEST (prev_set))
19061 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
19062 XEXP (SET_SRC (curr_set), 1)))
19063 return true;
19064 }
19065 }
19066
19067 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
19068 {
19069
19070 /* We're trying to match:
19071 prev (movk) == (set (zero_extract (reg r0)
19072 (const_int 16)
19073 (const_int 32))
19074 (const_int imm16_1))
19075 curr (movk) == (set (zero_extract (reg r0)
19076 (const_int 16)
19077 (const_int 48))
19078 (const_int imm16_2)) */
19079
19080 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
19081 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
19082 && REG_P (XEXP (SET_DEST (prev_set), 0))
19083 && REG_P (XEXP (SET_DEST (curr_set), 0))
19084 && REGNO (XEXP (SET_DEST (prev_set), 0))
19085 == REGNO (XEXP (SET_DEST (curr_set), 0))
19086 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
19087 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
19088 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
19089 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
19090 && CONST_INT_P (SET_SRC (prev_set))
19091 && CONST_INT_P (SET_SRC (curr_set)))
19092 return true;
19093
19094 }
19095 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
19096 {
19097 /* We're trying to match:
19098 prev (adrp) == (set (reg r0)
19099 (high (symbol_ref ("SYM"))))
19100 curr (ldr) == (set (reg r1)
19101 (mem (lo_sum (reg r0)
19102 (symbol_ref ("SYM")))))
19103 or
19104 curr (ldr) == (set (reg r1)
19105 (zero_extend (mem
19106 (lo_sum (reg r0)
19107 (symbol_ref ("SYM")))))) */
19108 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19109 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19110 {
19111 rtx curr_src = SET_SRC (curr_set);
19112
19113 if (GET_CODE (curr_src) == ZERO_EXTEND)
19114 curr_src = XEXP (curr_src, 0);
19115
19116 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
19117 && REG_P (XEXP (XEXP (curr_src, 0), 0))
19118 && REGNO (XEXP (XEXP (curr_src, 0), 0))
19119 == REGNO (SET_DEST (prev_set))
19120 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
19121 XEXP (SET_SRC (prev_set), 0)))
19122 return true;
19123 }
19124 }
19125
19126 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
19127 && any_condjump_p (curr))
19128 {
19129 unsigned int condreg1, condreg2;
19130 rtx cc_reg_1;
19131 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
19132 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
19133
19134 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
19135 && prev
19136 && modified_in_p (cc_reg_1, prev))
19137 {
19138 enum attr_type prev_type = get_attr_type (prev);
19139
19140 /* FIXME: this misses some instructions which ThunderX considers simple
19141 arithmetic instructions. Simple shifts are missed here. */
19142 if (prev_type == TYPE_ALUS_SREG
19143 || prev_type == TYPE_ALUS_IMM
19144 || prev_type == TYPE_LOGICS_REG
19145 || prev_type == TYPE_LOGICS_IMM)
19146 return true;
19147 }
19148 }
19149
19150 if (prev_set
19151 && curr_set
19152 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
19153 && any_condjump_p (curr))
19154 {
19155 /* We're trying to match:
19156 prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
19157 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19158 (const_int 0))
19159 (label_ref ("SYM"))
19160 (pc)) */
19161 if (SET_DEST (curr_set) == (pc_rtx)
19162 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19163 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19164 && REG_P (SET_DEST (prev_set))
19165 && REGNO (SET_DEST (prev_set))
19166 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19167 {
19168 /* Fuse ALU operations followed by conditional branch instruction. */
19169 switch (get_attr_type (prev))
19170 {
19171 case TYPE_ALU_IMM:
19172 case TYPE_ALU_SREG:
19173 case TYPE_ADC_REG:
19174 case TYPE_ADC_IMM:
19175 case TYPE_ADCS_REG:
19176 case TYPE_ADCS_IMM:
19177 case TYPE_LOGIC_REG:
19178 case TYPE_LOGIC_IMM:
19179 case TYPE_CSEL:
19180 case TYPE_ADR:
19181 case TYPE_MOV_IMM:
19182 case TYPE_SHIFT_REG:
19183 case TYPE_SHIFT_IMM:
19184 case TYPE_BFM:
19185 case TYPE_RBIT:
19186 case TYPE_REV:
19187 case TYPE_EXTEND:
19188 return true;
19189
19190 default:;
19191 }
19192 }
19193 }
19194
19195 return false;
19196 }
19197
19198 /* Return true iff the instruction fusion described by OP is enabled. */
19199
19200 bool
19201 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19202 {
19203 return (aarch64_tune_params.fusible_ops & op) != 0;
19204 }
19205
19206 /* If MEM is in the form of [base+offset], extract the two parts
19207 of the address into BASE and OFFSET; otherwise return false
19208 after clearing BASE and OFFSET. */
19209
19210 bool
19211 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19212 {
19213 rtx addr;
19214
19215 gcc_assert (MEM_P (mem));
19216
19217 addr = XEXP (mem, 0);
19218
19219 if (REG_P (addr))
19220 {
19221 *base = addr;
19222 *offset = const0_rtx;
19223 return true;
19224 }
19225
19226 if (GET_CODE (addr) == PLUS
19227 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19228 {
19229 *base = XEXP (addr, 0);
19230 *offset = XEXP (addr, 1);
19231 return true;
19232 }
19233
19234 *base = NULL_RTX;
19235 *offset = NULL_RTX;
19236
19237 return false;
19238 }
19239
19240 /* Types for scheduling fusion. */
19241 enum sched_fusion_type
19242 {
19243 SCHED_FUSION_NONE = 0,
19244 SCHED_FUSION_LD_SIGN_EXTEND,
19245 SCHED_FUSION_LD_ZERO_EXTEND,
19246 SCHED_FUSION_LD,
19247 SCHED_FUSION_ST,
19248 SCHED_FUSION_NUM
19249 };
19250
19251 /* If INSN is a load or store with an address in the form of [base+offset],
19252 extract the two parts into BASE and OFFSET. Return the scheduling
19253 fusion type of this INSN. */
19254
19255 static enum sched_fusion_type
19256 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19257 {
19258 rtx x, dest, src;
19259 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19260
19261 gcc_assert (INSN_P (insn));
19262 x = PATTERN (insn);
19263 if (GET_CODE (x) != SET)
19264 return SCHED_FUSION_NONE;
19265
19266 src = SET_SRC (x);
19267 dest = SET_DEST (x);
19268
19269 machine_mode dest_mode = GET_MODE (dest);
19270
19271 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19272 return SCHED_FUSION_NONE;
19273
19274 if (GET_CODE (src) == SIGN_EXTEND)
19275 {
19276 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19277 src = XEXP (src, 0);
19278 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19279 return SCHED_FUSION_NONE;
19280 }
19281 else if (GET_CODE (src) == ZERO_EXTEND)
19282 {
19283 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19284 src = XEXP (src, 0);
19285 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19286 return SCHED_FUSION_NONE;
19287 }
19288
19289 if (GET_CODE (src) == MEM && REG_P (dest))
19290 extract_base_offset_in_addr (src, base, offset);
19291 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19292 {
19293 fusion = SCHED_FUSION_ST;
19294 extract_base_offset_in_addr (dest, base, offset);
19295 }
19296 else
19297 return SCHED_FUSION_NONE;
19298
19299 if (*base == NULL_RTX || *offset == NULL_RTX)
19300 fusion = SCHED_FUSION_NONE;
19301
19302 return fusion;
19303 }
19304
19305 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19306
19307 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19308 and PRI are only calculated for these instructions. For other instructions,
19309 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19310 types of instruction fusion can be added by returning different priorities.
19311
19312 It's important that irrelevant instructions get the largest FUSION_PRI. */
19313
19314 static void
19315 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19316 int *fusion_pri, int *pri)
19317 {
19318 int tmp, off_val;
19319 rtx base, offset;
19320 enum sched_fusion_type fusion;
19321
19322 gcc_assert (INSN_P (insn));
19323
19324 tmp = max_pri - 1;
19325 fusion = fusion_load_store (insn, &base, &offset);
19326 if (fusion == SCHED_FUSION_NONE)
19327 {
19328 *pri = tmp;
19329 *fusion_pri = tmp;
19330 return;
19331 }
19332
19333 /* Set FUSION_PRI according to fusion type and base register. */
19334 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19335
19336 /* Calculate PRI. */
19337 tmp /= 2;
19338
19339 /* The INSN with the smaller offset goes first. */
19340 off_val = (int)(INTVAL (offset));
19341 if (off_val >= 0)
19342 tmp -= (off_val & 0xfffff);
19343 else
19344 tmp += ((- off_val) & 0xfffff);
19345
19346 *pri = tmp;
19347 return;
19348 }
19349
19350 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19351 Adjust priority of sha1h instructions so they are scheduled before
19352 other SHA1 instructions. */
19353
19354 static int
19355 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19356 {
19357 rtx x = PATTERN (insn);
19358
19359 if (GET_CODE (x) == SET)
19360 {
19361 x = SET_SRC (x);
19362
19363 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19364 return priority + 10;
19365 }
19366
19367 return priority;
19368 }
19369
19370 /* Given OPERANDS of consecutive load/store, check if we can merge
19371 them into ldp/stp. LOAD is true if they are load instructions.
19372 MODE is the mode of the memory operands. */
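/* For instance (a sketch, assuming X-register loads from the same base):
   ldr x0, [x2, 8] followed by ldr x1, [x2, 16] passes these checks and can
   later be emitted as ldp x0, x1, [x2, 8]; the checks below reject cases
   such as identical destination registers, volatile accesses, mismatched
   bases or non-consecutive offsets.  */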
19373
19374 bool
19375 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19376 machine_mode mode)
19377 {
19378 HOST_WIDE_INT offval_1, offval_2, msize;
19379 enum reg_class rclass_1, rclass_2;
19380 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19381
19382 if (load)
19383 {
19384 mem_1 = operands[1];
19385 mem_2 = operands[3];
19386 reg_1 = operands[0];
19387 reg_2 = operands[2];
19388 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19389 if (REGNO (reg_1) == REGNO (reg_2))
19390 return false;
19391 }
19392 else
19393 {
19394 mem_1 = operands[0];
19395 mem_2 = operands[2];
19396 reg_1 = operands[1];
19397 reg_2 = operands[3];
19398 }
19399
19400 /* The mems cannot be volatile. */
19401 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19402 return false;
19403
19404 /* If we have SImode and slow unaligned ldp,
19405 check that the alignment is at least 8 bytes. */
19406 if (mode == SImode
19407 && (aarch64_tune_params.extra_tuning_flags
19408 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19409 && !optimize_size
19410 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19411 return false;
19412
19413 /* Check if the addresses are in the form of [base+offset]. */
19414 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19415 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19416 return false;
19417 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19418 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19419 return false;
19420
19421 /* Check if the bases are the same. */
19422 if (!rtx_equal_p (base_1, base_2))
19423 return false;
19424
19425 /* The operands must be of the same size. */
19426 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19427 GET_MODE_SIZE (GET_MODE (mem_2))));
19428
19429 offval_1 = INTVAL (offset_1);
19430 offval_2 = INTVAL (offset_2);
19431 /* We should only be trying this for fixed-sized modes. There is no
19432 SVE LDP/STP instruction. */
19433 msize = GET_MODE_SIZE (mode).to_constant ();
19434 /* Check if the offsets are consecutive. */
19435 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19436 return false;
19437
19438 /* Check if the addresses are clobbered by load. */
19439 if (load)
19440 {
19441 if (reg_mentioned_p (reg_1, mem_1))
19442 return false;
19443
19444 /* In increasing order, the last load can clobber the address. */
19445 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19446 return false;
19447 }
19448
19449 /* One of the memory accesses must be a mempair operand.
19450 If it is not the first one, they need to be swapped by the
19451 peephole. */
19452 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19453 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19454 return false;
19455
19456 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19457 rclass_1 = FP_REGS;
19458 else
19459 rclass_1 = GENERAL_REGS;
19460
19461 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19462 rclass_2 = FP_REGS;
19463 else
19464 rclass_2 = GENERAL_REGS;
19465
19466 /* Check if the registers are of the same class. */
19467 if (rclass_1 != rclass_2)
19468 return false;
19469
19470 return true;
19471 }
19472
19473 /* Given OPERANDS of consecutive load/store that can be merged,
19474 swap them if they are not in ascending order. */
19475 void
19476 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19477 {
19478 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19479 HOST_WIDE_INT offval_1, offval_2;
19480
19481 if (load)
19482 {
19483 mem_1 = operands[1];
19484 mem_2 = operands[3];
19485 }
19486 else
19487 {
19488 mem_1 = operands[0];
19489 mem_2 = operands[2];
19490 }
19491
19492 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19493 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19494
19495 offval_1 = INTVAL (offset_1);
19496 offval_2 = INTVAL (offset_2);
19497
19498 if (offval_1 > offval_2)
19499 {
19500 /* Irrespective of whether this is a load or a store,
19501 we do the same swap. */
19502 std::swap (operands[0], operands[2]);
19503 std::swap (operands[1], operands[3]);
19504 }
19505 }
19506
19507 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19508 comparison between the two. */
19509 int
19510 aarch64_host_wide_int_compare (const void *x, const void *y)
19511 {
19512 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19513 * ((const HOST_WIDE_INT *) y));
19514 }
19515
19516 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19517 other pointing to a REG rtx containing an offset, compare the offsets
19518 of the two pairs.
19519
19520 Return:
19521
19522 1 iff offset (X) > offset (Y)
19523 0 iff offset (X) == offset (Y)
19524 -1 iff offset (X) < offset (Y) */
19525 int
19526 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19527 {
19528 const rtx * operands_1 = (const rtx *) x;
19529 const rtx * operands_2 = (const rtx *) y;
19530 rtx mem_1, mem_2, base, offset_1, offset_2;
19531
19532 if (MEM_P (operands_1[0]))
19533 mem_1 = operands_1[0];
19534 else
19535 mem_1 = operands_1[1];
19536
19537 if (MEM_P (operands_2[0]))
19538 mem_2 = operands_2[0];
19539 else
19540 mem_2 = operands_2[1];
19541
19542 /* Extract the offsets. */
19543 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19544 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19545
19546 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19547
19548 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19549 }
19550
19551 /* Given OPERANDS of consecutive load/store, check if we can merge
19552 them into ldp/stp by adjusting the offset. LOAD is true if they
19553 are load instructions. MODE is the mode of the memory operands.
19554
19555 Given the consecutive stores below:
19556
19557 str w1, [xb, 0x100]
19558 str w1, [xb, 0x104]
19559 str w1, [xb, 0x108]
19560 str w1, [xb, 0x10c]
19561
19562 Though the offsets are out of the range supported by stp, we can
19563 still pair them after adjusting the offset, like:
19564
19565 add scratch, xb, 0x100
19566 stp w1, w1, [scratch]
19567 stp w1, w1, [scratch, 0x8]
19568
19569 The peephole patterns detecting this opportunity should guarantee
19570 that the scratch register is available. */
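/* The range checks below reflect the LDP/STP immediate field: a signed
   7-bit value scaled by the access size, i.e. roughly offsets in
   [-64 * msize, 63 * msize] around the (possibly adjusted) base.  */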
19571
19572 bool
19573 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19574 scalar_mode mode)
19575 {
19576 const int num_insns = 4;
19577 enum reg_class rclass;
19578 HOST_WIDE_INT offvals[num_insns], msize;
19579 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19580
19581 if (load)
19582 {
19583 for (int i = 0; i < num_insns; i++)
19584 {
19585 reg[i] = operands[2 * i];
19586 mem[i] = operands[2 * i + 1];
19587
19588 gcc_assert (REG_P (reg[i]));
19589 }
19590
19591 /* Do not attempt to merge the loads if the loads clobber each other. */
19592 for (int i = 0; i < 8; i += 2)
19593 for (int j = i + 2; j < 8; j += 2)
19594 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19595 return false;
19596 }
19597 else
19598 for (int i = 0; i < num_insns; i++)
19599 {
19600 mem[i] = operands[2 * i];
19601 reg[i] = operands[2 * i + 1];
19602 }
19603
19604 /* Skip if memory operand is by itself valid for ldp/stp. */
19605 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19606 return false;
19607
19608 for (int i = 0; i < num_insns; i++)
19609 {
19610 /* The mems cannot be volatile. */
19611 if (MEM_VOLATILE_P (mem[i]))
19612 return false;
19613
19614 /* Check if the addresses are in the form of [base+offset]. */
19615 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19616 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19617 return false;
19618 }
19619
19620 /* Check if the registers are of the same class. */
19621 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19622 ? FP_REGS : GENERAL_REGS;
19623
19624 for (int i = 1; i < num_insns; i++)
19625 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19626 {
19627 if (rclass != FP_REGS)
19628 return false;
19629 }
19630 else
19631 {
19632 if (rclass != GENERAL_REGS)
19633 return false;
19634 }
19635
19636 /* Only the last register in the order in which they occur
19637 may be clobbered by the load. */
19638 if (rclass == GENERAL_REGS && load)
19639 for (int i = 0; i < num_insns - 1; i++)
19640 if (reg_mentioned_p (reg[i], mem[i]))
19641 return false;
19642
19643 /* Check if the bases are the same. */
19644 for (int i = 0; i < num_insns - 1; i++)
19645 if (!rtx_equal_p (base[i], base[i + 1]))
19646 return false;
19647
19648 for (int i = 0; i < num_insns; i++)
19649 offvals[i] = INTVAL (offset[i]);
19650
19651 msize = GET_MODE_SIZE (mode);
19652
19653 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19654 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19655 aarch64_host_wide_int_compare);
19656
19657 if (!(offvals[1] == offvals[0] + msize
19658 && offvals[3] == offvals[2] + msize))
19659 return false;
19660
19661 /* Check that offsets are within range of each other. The ldp/stp
19662 instructions have 7 bit immediate offsets, so use 0x80. */
19663 if (offvals[2] - offvals[0] >= msize * 0x80)
19664 return false;
19665
19666 /* The offsets must be aligned with respect to each other. */
19667 if (offvals[0] % msize != offvals[2] % msize)
19668 return false;
19669
19670 /* If we have SImode and slow unaligned ldp,
19671 check that the alignment is at least 8 bytes. */
19672 if (mode == SImode
19673 && (aarch64_tune_params.extra_tuning_flags
19674 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19675 && !optimize_size
19676 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19677 return false;
19678
19679 return true;
19680 }
19681
19682 /* Given OPERANDS of consecutive load/store, this function pairs them
19683 into LDP/STP after adjusting the offset. It depends on the fact
19684 that the operands can be sorted so the offsets are correct for STP.
19685 MODE is the mode of the memory operands. CODE is the rtl operator
19686 that should be applied to all memory operands; it is SIGN_EXTEND,
19687 ZERO_EXTEND or UNKNOWN. */
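/* A rough trace under the str w1, [xb, 0x100] ... example above
   (hypothetical offsets): for SImode stores at xb+0x100 .. xb+0x10c,
   msize is 4, so base_off starts at (0x100 + 0x108) / 2 = 0x104 and is
   then bumped by msize to 0x108; the two STPs end up at offsets -8 and 0
   from the new base, well inside the [-256, 252] range for 4-byte
   accesses.  */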
19688
19689 bool
19690 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19691 scalar_mode mode, RTX_CODE code)
19692 {
19693 rtx base, offset_1, offset_3, t1, t2;
19694 rtx mem_1, mem_2, mem_3, mem_4;
19695 rtx temp_operands[8];
19696 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19697 stp_off_upper_limit, stp_off_lower_limit, msize;
19698
19699 /* We make changes on a copy as we may still bail out. */
19700 for (int i = 0; i < 8; i ++)
19701 temp_operands[i] = operands[i];
19702
19703 /* Sort the operands. */
19704 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19705
19706 /* Copy the memory operands so that if we have to bail for some
19707 reason the original addresses are unchanged. */
19708 if (load)
19709 {
19710 mem_1 = copy_rtx (temp_operands[1]);
19711 mem_2 = copy_rtx (temp_operands[3]);
19712 mem_3 = copy_rtx (temp_operands[5]);
19713 mem_4 = copy_rtx (temp_operands[7]);
19714 }
19715 else
19716 {
19717 mem_1 = copy_rtx (temp_operands[0]);
19718 mem_2 = copy_rtx (temp_operands[2]);
19719 mem_3 = copy_rtx (temp_operands[4]);
19720 mem_4 = copy_rtx (temp_operands[6]);
19721 gcc_assert (code == UNKNOWN);
19722 }
19723
19724 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19725 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19726 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19727 && offset_3 != NULL_RTX);
19728
19729 /* Adjust offset so it can fit in LDP/STP instruction. */
19730 msize = GET_MODE_SIZE (mode);
19731 stp_off_upper_limit = msize * (0x40 - 1);
19732 stp_off_lower_limit = - msize * 0x40;
19733
19734 off_val_1 = INTVAL (offset_1);
19735 off_val_3 = INTVAL (offset_3);
19736
19737 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19738 if (msize <= 4)
19739 base_off = (off_val_1 + off_val_3) / 2;
19740 else
19741 /* However, due to issues with negative LDP/STP offset generation for
19742 larger modes (DF, DI and vector modes), we must not use negative
19743 addresses smaller than what 9 signed unadjusted bits can store. This
19744 provides the most range in this case. */
19745 base_off = off_val_1;
19746
19747 /* Adjust the base so that it is aligned with the addresses but still
19748 optimal. */
19749 if (base_off % msize != off_val_1 % msize)
19750 /* Fix the offset, bearing in mind we want to make it bigger not
19751 smaller. */
19752 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19753 else if (msize <= 4)
19754 /* The negative range of LDP/STP is one larger than the positive range. */
19755 base_off += msize;
19756
19757 /* Check if base offset is too big or too small. We can attempt to resolve
19758 this issue by setting it to the maximum value and seeing if the offsets
19759 still fit. */
19760 if (base_off >= 0x1000)
19761 {
19762 base_off = 0x1000 - 1;
19763 /* We must still make sure that the base offset is aligned with respect
19764 to the address. But it may not be made any bigger. */
19765 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19766 }
19767
19768 /* Likewise for the case where the base is too small. */
19769 if (base_off <= -0x1000)
19770 {
19771 base_off = -0x1000 + 1;
19772 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19773 }
19774
19775 /* Offset of the first STP/LDP. */
19776 new_off_1 = off_val_1 - base_off;
19777
19778 /* Offset of the second STP/LDP. */
19779 new_off_3 = off_val_3 - base_off;
19780
19781 /* The offsets must be within the range of the LDP/STP instructions. */
19782 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19783 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19784 return false;
19785
19786 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19787 new_off_1), true);
19788 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19789 new_off_1 + msize), true);
19790 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19791 new_off_3), true);
19792 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19793 new_off_3 + msize), true);
19794
19795 if (!aarch64_mem_pair_operand (mem_1, mode)
19796 || !aarch64_mem_pair_operand (mem_3, mode))
19797 return false;
19798
19799 if (code == ZERO_EXTEND)
19800 {
19801 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19802 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19803 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19804 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19805 }
19806 else if (code == SIGN_EXTEND)
19807 {
19808 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19809 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19810 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19811 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19812 }
19813
19814 if (load)
19815 {
19816 operands[0] = temp_operands[0];
19817 operands[1] = mem_1;
19818 operands[2] = temp_operands[2];
19819 operands[3] = mem_2;
19820 operands[4] = temp_operands[4];
19821 operands[5] = mem_3;
19822 operands[6] = temp_operands[6];
19823 operands[7] = mem_4;
19824 }
19825 else
19826 {
19827 operands[0] = mem_1;
19828 operands[1] = temp_operands[1];
19829 operands[2] = mem_2;
19830 operands[3] = temp_operands[3];
19831 operands[4] = mem_3;
19832 operands[5] = temp_operands[5];
19833 operands[6] = mem_4;
19834 operands[7] = temp_operands[7];
19835 }
19836
19837 /* Emit adjusting instruction. */
19838 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19839 /* Emit ldp/stp instructions. */
19840 t1 = gen_rtx_SET (operands[0], operands[1]);
19841 t2 = gen_rtx_SET (operands[2], operands[3]);
19842 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19843 t1 = gen_rtx_SET (operands[4], operands[5]);
19844 t2 = gen_rtx_SET (operands[6], operands[7]);
19845 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19846 return true;
19847 }
19848
19849 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19850 it isn't worth branching around empty masked ops (including masked
19851 stores). */
19852
19853 static bool
19854 aarch64_empty_mask_is_expensive (unsigned)
19855 {
19856 return false;
19857 }
19858
19859 /* Return true if a pseudo register should be created and used to hold
19860 the GOT address for PIC code. */
19861
19862 bool
19863 aarch64_use_pseudo_pic_reg (void)
19864 {
19865 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19866 }
19867
19868 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19869
19870 static int
19871 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19872 {
19873 switch (XINT (x, 1))
19874 {
19875 case UNSPEC_GOTSMALLPIC:
19876 case UNSPEC_GOTSMALLPIC28K:
19877 case UNSPEC_GOTTINYPIC:
19878 return 0;
19879 default:
19880 break;
19881 }
19882
19883 return default_unspec_may_trap_p (x, flags);
19884 }
19885
19886
19887 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19888 return the log2 of that value. Otherwise return -1. */
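/* For example: 8.0 yields 3 and 1.0 yields 0, while 0.5, 3.0, or any
   negative, NaN or infinite value yields -1.  */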
19889
19890 int
19891 aarch64_fpconst_pow_of_2 (rtx x)
19892 {
19893 const REAL_VALUE_TYPE *r;
19894
19895 if (!CONST_DOUBLE_P (x))
19896 return -1;
19897
19898 r = CONST_DOUBLE_REAL_VALUE (x);
19899
19900 if (REAL_VALUE_NEGATIVE (*r)
19901 || REAL_VALUE_ISNAN (*r)
19902 || REAL_VALUE_ISINF (*r)
19903 || !real_isinteger (r, DFmode))
19904 return -1;
19905
19906 return exact_log2 (real_to_integer (r));
19907 }
19908
19909 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
19910 power of 2 (i.e. 1/2^n), return the exponent n. e.g. for x == (1/2^n)
19911 return n. Otherwise return -1. */
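/* For example: 0.125 == 1/2^3 yields 3; values whose exponent falls outside
   the [1, 32] range checked below, or which have no exact inverse,
   yield -1.  */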
19912
19913 int
19914 aarch64_fpconst_pow2_recip (rtx x)
19915 {
19916 REAL_VALUE_TYPE r0;
19917
19918 if (!CONST_DOUBLE_P (x))
19919 return -1;
19920
19921 r0 = *CONST_DOUBLE_REAL_VALUE (x);
19922 if (exact_real_inverse (DFmode, &r0)
19923 && !REAL_VALUE_NEGATIVE (r0))
19924 {
19925 int ret = exact_log2 (real_to_integer (&r0));
19926 if (ret >= 1 && ret <= 32)
19927 return ret;
19928 }
19929 return -1;
19930 }
19931
19932 /* If X is a vector of equal CONST_DOUBLE values and that value is
19933 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
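/* For example, a V4SF vector of four 4.0 values yields 2; note that a
   vector of 1.0s yields -1, since the first element's log2 of 0 fails the
   > 0 check below.  */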
19934
19935 int
19936 aarch64_vec_fpconst_pow_of_2 (rtx x)
19937 {
19938 int nelts;
19939 if (GET_CODE (x) != CONST_VECTOR
19940 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19941 return -1;
19942
19943 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19944 return -1;
19945
19946 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19947 if (firstval <= 0)
19948 return -1;
19949
19950 for (int i = 1; i < nelts; i++)
19951 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19952 return -1;
19953
19954 return firstval;
19955 }
19956
19957 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19958 to float.
19959
19960 __fp16 always promotes through this hook.
19961 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19962 through the generic excess precision logic rather than here. */
19963
19964 static tree
19965 aarch64_promoted_type (const_tree t)
19966 {
19967 if (SCALAR_FLOAT_TYPE_P (t)
19968 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19969 return float_type_node;
19970
19971 return NULL_TREE;
19972 }
19973
19974 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19975
19976 static bool
19977 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19978 optimization_type opt_type)
19979 {
19980 switch (op)
19981 {
19982 case rsqrt_optab:
19983 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19984
19985 default:
19986 return true;
19987 }
19988 }
19989
19990 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19991
19992 static unsigned int
19993 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19994 int *offset)
19995 {
19996 /* Polynomial invariant 1 == (VG / 2) - 1. */
19997 gcc_assert (i == 1);
19998 *factor = 2;
19999 *offset = 1;
20000 return AARCH64_DWARF_VG;
20001 }
20002
20003 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
20004 if MODE is HFmode, and punt to the generic implementation otherwise. */
20005
20006 static bool
20007 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
20008 {
20009 return (mode == HFmode
20010 ? true
20011 : default_libgcc_floating_mode_supported_p (mode));
20012 }
20013
20014 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20015 if MODE is HFmode, and punt to the generic implementation otherwise. */
20016
20017 static bool
20018 aarch64_scalar_mode_supported_p (scalar_mode mode)
20019 {
20020 return (mode == HFmode
20021 ? true
20022 : default_scalar_mode_supported_p (mode));
20023 }
20024
20025 /* Set the value of FLT_EVAL_METHOD.
20026 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20027
20028 0: evaluate all operations and constants, whose semantic type has at
20029 most the range and precision of type float, to the range and
20030 precision of float; evaluate all other operations and constants to
20031 the range and precision of the semantic type;
20032
20033 N, where _FloatN is a supported interchange floating type:
20034 evaluate all operations and constants, whose semantic type has at
20035 most the range and precision of _FloatN type, to the range and
20036 precision of the _FloatN type; evaluate all other operations and
20037 constants to the range and precision of the semantic type;
20038
20039 If we have the ARMv8.2-A extensions then we support _Float16 in native
20040 precision, so we should set this to 16. Otherwise, we support the type,
20041 but want to evaluate expressions in float precision, so set this to
20042 0. */
20043
20044 static enum flt_eval_method
20045 aarch64_excess_precision (enum excess_precision_type type)
20046 {
20047 switch (type)
20048 {
20049 case EXCESS_PRECISION_TYPE_FAST:
20050 case EXCESS_PRECISION_TYPE_STANDARD:
20051 /* We can calculate either in 16-bit range and precision or
20052 32-bit range and precision. Make that decision based on whether
20053 we have native support for the ARMv8.2-A 16-bit floating-point
20054 instructions or not. */
20055 return (TARGET_FP_F16INST
20056 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20057 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
20058 case EXCESS_PRECISION_TYPE_IMPLICIT:
20059 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
20060 default:
20061 gcc_unreachable ();
20062 }
20063 return FLT_EVAL_METHOD_UNPREDICTABLE;
20064 }
20065
20066 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20067 scheduled for speculative execution. Reject the long-running division
20068 and square-root instructions. */
20069
20070 static bool
20071 aarch64_sched_can_speculate_insn (rtx_insn *insn)
20072 {
20073 switch (get_attr_type (insn))
20074 {
20075 case TYPE_SDIV:
20076 case TYPE_UDIV:
20077 case TYPE_FDIVS:
20078 case TYPE_FDIVD:
20079 case TYPE_FSQRTS:
20080 case TYPE_FSQRTD:
20081 case TYPE_NEON_FP_SQRT_S:
20082 case TYPE_NEON_FP_SQRT_D:
20083 case TYPE_NEON_FP_SQRT_S_Q:
20084 case TYPE_NEON_FP_SQRT_D_Q:
20085 case TYPE_NEON_FP_DIV_S:
20086 case TYPE_NEON_FP_DIV_D:
20087 case TYPE_NEON_FP_DIV_S_Q:
20088 case TYPE_NEON_FP_DIV_D_Q:
20089 return false;
20090 default:
20091 return true;
20092 }
20093 }
20094
20095 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20096
20097 static int
20098 aarch64_compute_pressure_classes (reg_class *classes)
20099 {
20100 int i = 0;
20101 classes[i++] = GENERAL_REGS;
20102 classes[i++] = FP_REGS;
20103 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20104 registers need to go in PR_LO_REGS at some point during their
20105 lifetime. Splitting it into two halves has the effect of making
20106 all predicates count against PR_LO_REGS, so that we try whenever
20107 possible to restrict the number of live predicates to 8. This
20108 greatly reduces the amount of spilling in certain loops. */
20109 classes[i++] = PR_LO_REGS;
20110 classes[i++] = PR_HI_REGS;
20111 return i;
20112 }
20113
20114 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20115
20116 static bool
20117 aarch64_can_change_mode_class (machine_mode from,
20118 machine_mode to, reg_class_t)
20119 {
20120 if (BYTES_BIG_ENDIAN)
20121 {
20122 bool from_sve_p = aarch64_sve_data_mode_p (from);
20123 bool to_sve_p = aarch64_sve_data_mode_p (to);
20124
20125 /* Don't allow changes between SVE data modes and non-SVE modes.
20126 See the comment at the head of aarch64-sve.md for details. */
20127 if (from_sve_p != to_sve_p)
20128 return false;
20129
20130 /* Don't allow changes in element size: lane 0 of the new vector
20131 would not then be lane 0 of the old vector. See the comment
20132 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20133 description.
20134
20135 In the worst case, this forces a register to be spilled in
20136 one mode and reloaded in the other, which handles the
20137 endianness correctly. */
20138 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
20139 return false;
20140 }
20141 return true;
20142 }
20143
20144 /* Implement TARGET_EARLY_REMAT_MODES. */
20145
20146 static void
20147 aarch64_select_early_remat_modes (sbitmap modes)
20148 {
20149 /* SVE values are not normally live across a call, so it should be
20150 worth doing early rematerialization even in VL-specific mode. */
20151 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
20152 if (aarch64_sve_mode_p ((machine_mode) i))
20153 bitmap_set_bit (modes, i);
20154 }
20155
20156 /* Override the default target speculation_safe_value. */
20157 static rtx
20158 aarch64_speculation_safe_value (machine_mode mode,
20159 rtx result, rtx val, rtx failval)
20160 {
20161 /* Maybe we should warn if falling back to hard barriers. They are
20162 likely to be noticeably more expensive than the alternative below. */
20163 if (!aarch64_track_speculation)
20164 return default_speculation_safe_value (mode, result, val, failval);
20165
20166 if (!REG_P (val))
20167 val = copy_to_mode_reg (mode, val);
20168
20169 if (!aarch64_reg_or_zero (failval, mode))
20170 failval = copy_to_mode_reg (mode, failval);
20171
20172 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20173 return result;
20174 }
20175
20176 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20177 Look into the tuning structure for an estimate.
20178 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20179 Advanced SIMD 128 bits. */
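/* A small worked example (hypothetical tuning): if the tuning structure
   says sve_width == 256, then over_128 == 128 and a poly_int64 of 2 + 2x
   (coeffs {2, 2}) is estimated as 2 + 2 * 128 / 128 == 4.  */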
20180
20181 static HOST_WIDE_INT
20182 aarch64_estimated_poly_value (poly_int64 val)
20183 {
20184 enum aarch64_sve_vector_bits_enum width_source
20185 = aarch64_tune_params.sve_width;
20186
20187 /* If we still don't have an estimate, use the default. */
20188 if (width_source == SVE_SCALABLE)
20189 return default_estimated_poly_value (val);
20190
20191 HOST_WIDE_INT over_128 = width_source - 128;
20192 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20193 }
20194
20195
20196 /* Return true for types that could be supported as SIMD return or
20197 argument types. */
20198
20199 static bool
20200 supported_simd_type (tree t)
20201 {
20202 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20203 {
20204 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20205 return s == 1 || s == 2 || s == 4 || s == 8;
20206 }
20207 return false;
20208 }
20209
20210 /* Return true for types that currently are supported as SIMD return
20211 or argument types. */
20212
20213 static bool
20214 currently_supported_simd_type (tree t, tree b)
20215 {
20216 if (COMPLEX_FLOAT_TYPE_P (t))
20217 return false;
20218
20219 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20220 return false;
20221
20222 return supported_simd_type (t);
20223 }
20224
20225 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
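/* For instance (a sketch): with a 32-bit base type and no explicit simdlen,
   the code below returns a count of 2, producing one clone with simdlen 2
   (64-bit vectors) and one with simdlen 4 (128-bit vectors).  */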
20226
20227 static int
20228 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20229 struct cgraph_simd_clone *clonei,
20230 tree base_type, int num)
20231 {
20232 tree t, ret_type, arg_type;
20233 unsigned int elt_bits, vec_bits, count;
20234
20235 if (!TARGET_SIMD)
20236 return 0;
20237
20238 if (clonei->simdlen
20239 && (clonei->simdlen < 2
20240 || clonei->simdlen > 1024
20241 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20242 {
20243 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20244 "unsupported simdlen %d", clonei->simdlen);
20245 return 0;
20246 }
20247
20248 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20249 if (TREE_CODE (ret_type) != VOID_TYPE
20250 && !currently_supported_simd_type (ret_type, base_type))
20251 {
20252 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20253 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20254 "GCC does not currently support mixed size types "
20255 "for %<simd%> functions");
20256 else if (supported_simd_type (ret_type))
20257 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20258 "GCC does not currently support return type %qT "
20259 "for %<simd%> functions", ret_type);
20260 else
20261 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20262 "unsupported return type %qT for %<simd%> functions",
20263 ret_type);
20264 return 0;
20265 }
20266
20267 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20268 {
20269 arg_type = TREE_TYPE (t);
20270
20271 if (!currently_supported_simd_type (arg_type, base_type))
20272 {
20273 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20274 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20275 "GCC does not currently support mixed size types "
20276 "for %<simd%> functions");
20277 else
20278 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20279 "GCC does not currently support argument type %qT "
20280 "for %<simd%> functions", arg_type);
20281 return 0;
20282 }
20283 }
20284
20285 clonei->vecsize_mangle = 'n';
20286 clonei->mask_mode = VOIDmode;
20287 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20288 if (clonei->simdlen == 0)
20289 {
20290 count = 2;
20291 vec_bits = (num == 0 ? 64 : 128);
20292 clonei->simdlen = vec_bits / elt_bits;
20293 }
20294 else
20295 {
20296 count = 1;
20297 vec_bits = clonei->simdlen * elt_bits;
20298 if (vec_bits != 64 && vec_bits != 128)
20299 {
20300 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20301 "GCC does not currently support simdlen %d for type %qT",
20302 clonei->simdlen, base_type);
20303 return 0;
20304 }
20305 }
20306 clonei->vecsize_int = vec_bits;
20307 clonei->vecsize_float = vec_bits;
20308 return count;
20309 }
20310
20311 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20312
20313 static void
20314 aarch64_simd_clone_adjust (struct cgraph_node *node)
20315 {
20316 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20317 use the correct ABI. */
20318
20319 tree t = TREE_TYPE (node->decl);
20320 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20321 TYPE_ATTRIBUTES (t));
20322 }
20323
20324 /* Implement TARGET_SIMD_CLONE_USABLE. */
20325
20326 static int
20327 aarch64_simd_clone_usable (struct cgraph_node *node)
20328 {
20329 switch (node->simdclone->vecsize_mangle)
20330 {
20331 case 'n':
20332 if (!TARGET_SIMD)
20333 return -1;
20334 return 0;
20335 default:
20336 gcc_unreachable ();
20337 }
20338 }
20339
20340 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20341
20342 static int
20343 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20344 {
20345 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20346 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20347 return 0;
20348 return 1;
20349 }
20350
20351 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20352
20353 static const char *
20354 aarch64_get_multilib_abi_name (void)
20355 {
20356 if (TARGET_BIG_END)
20357 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20358 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20359 }
20360
20361 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20362 global-variable-based guard, use the default; otherwise
20363 return a null tree. */
20364 static tree
20365 aarch64_stack_protect_guard (void)
20366 {
20367 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20368 return default_stack_protect_guard ();
20369
20370 return NULL_TREE;
20371 }
20372
20373 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20374 section at the end if needed. */
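/* Roughly, for an LP64 target with only BTI enabled, the emitted note looks
   like the following (a sketch; exact directives depend on the assembler):
     .section .note.gnu.property
     .align 3
     .word 4          // namesz
     .word 16         // descsz
     .word 5          // NT_GNU_PROPERTY_TYPE_0
     .string "GNU"
     .word 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word 4          // datasz
     .word 1          // GNU_PROPERTY_AARCH64_FEATURE_1_BTI
     .align 3  */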
20375 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20376 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20377 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20378 void
20379 aarch64_file_end_indicate_exec_stack ()
20380 {
20381 file_end_indicate_exec_stack ();
20382
20383 unsigned feature_1_and = 0;
20384 if (aarch64_bti_enabled ())
20385 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20386
20387 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20388 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20389
20390 if (feature_1_and)
20391 {
20392 /* Generate .note.gnu.property section. */
20393 switch_to_section (get_section (".note.gnu.property",
20394 SECTION_NOTYPE, NULL));
20395
20396 /* PT_NOTE header: namesz, descsz, type.
20397 namesz = 4 ("GNU\0")
20398 descsz = 16 (Size of the program property array)
20399 [(12 + padding) * Number of array elements]
20400 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20401 assemble_align (POINTER_SIZE);
20402 assemble_integer (GEN_INT (4), 4, 32, 1);
20403 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20404 assemble_integer (GEN_INT (5), 4, 32, 1);
20405
20406 /* PT_NOTE name. */
20407 assemble_string ("GNU", 4);
20408
20409 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20410 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20411 datasz = 4
20412 data = feature_1_and. */
20413 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20414 assemble_integer (GEN_INT (4), 4, 32, 1);
20415 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20416
20417 /* Pad the size of the note to the required alignment. */
20418 assemble_align (POINTER_SIZE);
20419 }
20420 }
20421 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20422 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20423 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20424
20425 /* Target-specific selftests. */
20426
20427 #if CHECKING_P
20428
20429 namespace selftest {
20430
20431 /* Selftest for the RTL loader.
20432 Verify that the RTL loader copes with a dump from
20433 print_rtx_function. This is essentially just a test that class
20434 function_reader can handle a real dump, but it also verifies
20435 that lookup_reg_by_dump_name correctly handles hard regs.
20436 The presence of hard reg names in the dump means that the test is
20437 target-specific, hence it is in this file. */
20438
20439 static void
20440 aarch64_test_loading_full_dump ()
20441 {
20442 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20443
20444 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20445
20446 rtx_insn *insn_1 = get_insn_by_uid (1);
20447 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20448
20449 rtx_insn *insn_15 = get_insn_by_uid (15);
20450 ASSERT_EQ (INSN, GET_CODE (insn_15));
20451 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20452
20453 /* Verify crtl->return_rtx. */
20454 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20455 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20456 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20457 }
20458
20459 /* Run all target-specific selftests. */
20460
20461 static void
20462 aarch64_run_selftests (void)
20463 {
20464 aarch64_test_loading_full_dump ();
20465 }
20466
20467 } // namespace selftest
20468
20469 #endif /* #if CHECKING_P */
20470
20471 #undef TARGET_STACK_PROTECT_GUARD
20472 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20473
20474 #undef TARGET_ADDRESS_COST
20475 #define TARGET_ADDRESS_COST aarch64_address_cost
20476
20477 /* This hook determines whether unnamed bitfields affect the alignment
20478 of the containing structure. The hook returns true if the structure
20479 should inherit the alignment requirements of an unnamed bitfield's
20480 type. */
20481 #undef TARGET_ALIGN_ANON_BITFIELD
20482 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20483
20484 #undef TARGET_ASM_ALIGNED_DI_OP
20485 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20486
20487 #undef TARGET_ASM_ALIGNED_HI_OP
20488 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20489
20490 #undef TARGET_ASM_ALIGNED_SI_OP
20491 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20492
20493 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20494 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20495 hook_bool_const_tree_hwi_hwi_const_tree_true
20496
20497 #undef TARGET_ASM_FILE_START
20498 #define TARGET_ASM_FILE_START aarch64_start_file
20499
20500 #undef TARGET_ASM_OUTPUT_MI_THUNK
20501 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20502
20503 #undef TARGET_ASM_SELECT_RTX_SECTION
20504 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20505
20506 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20507 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20508
20509 #undef TARGET_BUILD_BUILTIN_VA_LIST
20510 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20511
20512 #undef TARGET_CALLEE_COPIES
20513 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20514
20515 #undef TARGET_CAN_ELIMINATE
20516 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20517
20518 #undef TARGET_CAN_INLINE_P
20519 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20520
20521 #undef TARGET_CANNOT_FORCE_CONST_MEM
20522 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20523
20524 #undef TARGET_CASE_VALUES_THRESHOLD
20525 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20526
20527 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20528 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20529
20530 /* Only the least significant bit is used for initialization guard
20531 variables. */
20532 #undef TARGET_CXX_GUARD_MASK_BIT
20533 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20534
20535 #undef TARGET_C_MODE_FOR_SUFFIX
20536 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20537
20538 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20539 #undef TARGET_DEFAULT_TARGET_FLAGS
20540 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20541 #endif
20542
20543 #undef TARGET_CLASS_MAX_NREGS
20544 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20545
20546 #undef TARGET_BUILTIN_DECL
20547 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20548
20549 #undef TARGET_BUILTIN_RECIPROCAL
20550 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20551
20552 #undef TARGET_C_EXCESS_PRECISION
20553 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20554
20555 #undef TARGET_EXPAND_BUILTIN
20556 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20557
20558 #undef TARGET_EXPAND_BUILTIN_VA_START
20559 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20560
20561 #undef TARGET_FOLD_BUILTIN
20562 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20563
20564 #undef TARGET_FUNCTION_ARG
20565 #define TARGET_FUNCTION_ARG aarch64_function_arg
20566
20567 #undef TARGET_FUNCTION_ARG_ADVANCE
20568 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20569
20570 #undef TARGET_FUNCTION_ARG_BOUNDARY
20571 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20572
20573 #undef TARGET_FUNCTION_ARG_PADDING
20574 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20575
20576 #undef TARGET_GET_RAW_RESULT_MODE
20577 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20578 #undef TARGET_GET_RAW_ARG_MODE
20579 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20580
20581 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20582 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20583
20584 #undef TARGET_FUNCTION_VALUE
20585 #define TARGET_FUNCTION_VALUE aarch64_function_value
20586
20587 #undef TARGET_FUNCTION_VALUE_REGNO_P
20588 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20589
20590 #undef TARGET_GIMPLE_FOLD_BUILTIN
20591 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20592
20593 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20594 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20595
20596 #undef TARGET_INIT_BUILTINS
20597 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20598
20599 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20600 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20601 aarch64_ira_change_pseudo_allocno_class
20602
20603 #undef TARGET_LEGITIMATE_ADDRESS_P
20604 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20605
20606 #undef TARGET_LEGITIMATE_CONSTANT_P
20607 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20608
20609 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20610 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20611 aarch64_legitimize_address_displacement
20612
20613 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20614 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20615
20616 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20617 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20618 aarch64_libgcc_floating_mode_supported_p
20619
20620 #undef TARGET_MANGLE_TYPE
20621 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20622
20623 #undef TARGET_MEMORY_MOVE_COST
20624 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20625
20626 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20627 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20628
20629 #undef TARGET_MUST_PASS_IN_STACK
20630 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20631
20632 /* This target hook should return true if accesses to volatile bitfields
20633 should use the narrowest mode possible. It should return false if these
20634 accesses should use the bitfield container type. */
20635 #undef TARGET_NARROW_VOLATILE_BITFIELD
20636 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20637
20638 #undef TARGET_OPTION_OVERRIDE
20639 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20640
20641 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20642 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20643 aarch64_override_options_after_change
20644
20645 #undef TARGET_OPTION_SAVE
20646 #define TARGET_OPTION_SAVE aarch64_option_save
20647
20648 #undef TARGET_OPTION_RESTORE
20649 #define TARGET_OPTION_RESTORE aarch64_option_restore
20650
20651 #undef TARGET_OPTION_PRINT
20652 #define TARGET_OPTION_PRINT aarch64_option_print
20653
20654 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20655 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20656
20657 #undef TARGET_SET_CURRENT_FUNCTION
20658 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20659
20660 #undef TARGET_PASS_BY_REFERENCE
20661 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20662
20663 #undef TARGET_PREFERRED_RELOAD_CLASS
20664 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20665
20666 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20667 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20668
20669 #undef TARGET_PROMOTED_TYPE
20670 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20671
20672 #undef TARGET_SECONDARY_RELOAD
20673 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20674
20675 #undef TARGET_SHIFT_TRUNCATION_MASK
20676 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20677
20678 #undef TARGET_SETUP_INCOMING_VARARGS
20679 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20680
20681 #undef TARGET_STRUCT_VALUE_RTX
20682 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20683
20684 #undef TARGET_REGISTER_MOVE_COST
20685 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20686
20687 #undef TARGET_RETURN_IN_MEMORY
20688 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20689
20690 #undef TARGET_RETURN_IN_MSB
20691 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20692
20693 #undef TARGET_RTX_COSTS
20694 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20695
20696 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20697 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20698
20699 #undef TARGET_SCHED_ISSUE_RATE
20700 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20701
20702 #undef TARGET_SCHED_VARIABLE_ISSUE
20703 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
20704
20705 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20706 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20707 aarch64_sched_first_cycle_multipass_dfa_lookahead
20708
20709 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20710 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20711 aarch64_first_cycle_multipass_dfa_lookahead_guard
20712
20713 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20714 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20715 aarch64_get_separate_components
20716
20717 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20718 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20719 aarch64_components_for_bb
20720
20721 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20722 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20723 aarch64_disqualify_components
20724
20725 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20726 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20727 aarch64_emit_prologue_components
20728
20729 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20730 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20731 aarch64_emit_epilogue_components
20732
20733 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20734 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20735 aarch64_set_handled_components
20736
20737 #undef TARGET_TRAMPOLINE_INIT
20738 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20739
20740 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20741 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20742
20743 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20744 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20745
20746 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20747 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20748 aarch64_builtin_support_vector_misalignment
20749
20750 #undef TARGET_ARRAY_MODE
20751 #define TARGET_ARRAY_MODE aarch64_array_mode
20752
20753 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20754 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20755
20756 #undef TARGET_VECTORIZE_ADD_STMT_COST
20757 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20758
20759 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20760 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20761 aarch64_builtin_vectorization_cost
20762
20763 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20764 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20765
20766 #undef TARGET_VECTORIZE_BUILTINS
20767 #define TARGET_VECTORIZE_BUILTINS
20768
20769 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20770 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20771 aarch64_builtin_vectorized_function
20772
20773 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20774 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20775 aarch64_autovectorize_vector_sizes
20776
20777 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20778 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20779 aarch64_atomic_assign_expand_fenv
20780
20781 /* Section anchor support. */
20782
20783 #undef TARGET_MIN_ANCHOR_OFFSET
20784 #define TARGET_MIN_ANCHOR_OFFSET -256
20785
20786 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20787 byte offset; we can do much more for larger data types, but have no way
20788 to determine the size of the access. We assume accesses are aligned. */
20789 #undef TARGET_MAX_ANCHOR_OFFSET
20790 #define TARGET_MAX_ANCHOR_OFFSET 4095
20791
20792 #undef TARGET_VECTOR_ALIGNMENT
20793 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20794
20795 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20796 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20797 aarch64_vectorize_preferred_vector_alignment
20798 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20799 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20800 aarch64_simd_vector_alignment_reachable
20801
20802 /* vec_perm support. */
20803
20804 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20805 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20806 aarch64_vectorize_vec_perm_const
20807
20808 #undef TARGET_VECTORIZE_GET_MASK_MODE
20809 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20810 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20811 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20812 aarch64_empty_mask_is_expensive
20813 #undef TARGET_PREFERRED_ELSE_VALUE
20814 #define TARGET_PREFERRED_ELSE_VALUE \
20815 aarch64_preferred_else_value
20816
20817 #undef TARGET_INIT_LIBFUNCS
20818 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20819
20820 #undef TARGET_FIXED_CONDITION_CODE_REGS
20821 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20822
20823 #undef TARGET_FLAGS_REGNUM
20824 #define TARGET_FLAGS_REGNUM CC_REGNUM
20825
20826 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20827 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20828
20829 #undef TARGET_ASAN_SHADOW_OFFSET
20830 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20831
20832 #undef TARGET_LEGITIMIZE_ADDRESS
20833 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20834
20835 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20836 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20837
20838 #undef TARGET_CAN_USE_DOLOOP_P
20839 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20840
20841 #undef TARGET_SCHED_ADJUST_PRIORITY
20842 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20843
20844 #undef TARGET_SCHED_MACRO_FUSION_P
20845 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20846
20847 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20848 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20849
20850 #undef TARGET_SCHED_FUSION_PRIORITY
20851 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20852
20853 #undef TARGET_UNSPEC_MAY_TRAP_P
20854 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20855
20856 #undef TARGET_USE_PSEUDO_PIC_REG
20857 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20858
20859 #undef TARGET_PRINT_OPERAND
20860 #define TARGET_PRINT_OPERAND aarch64_print_operand
20861
20862 #undef TARGET_PRINT_OPERAND_ADDRESS
20863 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20864
20865 #undef TARGET_OPTAB_SUPPORTED_P
20866 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20867
20868 #undef TARGET_OMIT_STRUCT_RETURN_REG
20869 #define TARGET_OMIT_STRUCT_RETURN_REG true
20870
20871 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20872 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20873 aarch64_dwarf_poly_indeterminate_value
20874
20875 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20876 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20877 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20878
20879 #undef TARGET_HARD_REGNO_NREGS
20880 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20881 #undef TARGET_HARD_REGNO_MODE_OK
20882 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20883
20884 #undef TARGET_MODES_TIEABLE_P
20885 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20886
20887 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20888 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20889 aarch64_hard_regno_call_part_clobbered
20890
20891 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20892 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20893 aarch64_remove_extra_call_preserved_regs
20894
20895 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20896 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20897 aarch64_return_call_with_max_clobbers
20898
20899 #undef TARGET_CONSTANT_ALIGNMENT
20900 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20901
20902 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20903 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20904 aarch64_stack_clash_protection_alloca_probe_range
20905
20906 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20907 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20908
20909 #undef TARGET_CAN_CHANGE_MODE_CLASS
20910 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20911
20912 #undef TARGET_SELECT_EARLY_REMAT_MODES
20913 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20914
20915 #undef TARGET_SPECULATION_SAFE_VALUE
20916 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20917
20918 #undef TARGET_ESTIMATED_POLY_VALUE
20919 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20920
20921 #undef TARGET_ATTRIBUTE_TABLE
20922 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20923
20924 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20925 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20926 aarch64_simd_clone_compute_vecsize_and_simdlen
20927
20928 #undef TARGET_SIMD_CLONE_ADJUST
20929 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20930
20931 #undef TARGET_SIMD_CLONE_USABLE
20932 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20933
20934 #undef TARGET_COMP_TYPE_ATTRIBUTES
20935 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20936
20937 #undef TARGET_GET_MULTILIB_ABI_NAME
20938 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20939
20940 #if CHECKING_P
20941 #undef TARGET_RUN_TARGET_SELFTESTS
20942 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20943 #endif /* #if CHECKING_P */
20944
20945 #undef TARGET_ASM_POST_CFI_STARTPROC
20946 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20947
20948 struct gcc_target targetm = TARGET_INITIALIZER;
20949
20950 #include "gt-aarch64.h"